Skip to content

Instantly share code, notes, and snippets.

@sujathakvr
Last active August 29, 2015 14:27
Show Gist options
  • Save sujathakvr/3314036a32916d5cb420 to your computer and use it in GitHub Desktop.
Save sujathakvr/3314036a32916d5cb420 to your computer and use it in GitHub Desktop.
Counting the Frequency of words in large number of text files using Threads in java
Create the following folders
C:\input
C:\MyTestingRoot
C:\output
The folder names can vary; they are passed as command-line arguments when running the program.
The input folder contains files like (for e.g)
1.txt which contains Search Home New Image Ask Question Fact
2.txt which contains Fact
3.txt which contains asteroid sourc signific perturb planet among brightest main belt Fact
The data folder which needs to be scanned based on the above txt files will be available in C:\MyTestingRoot
the output generated will be in c:\output.
package com.ourOffice;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
public class CSVGenerator {
    /**
     * Writes the word-frequency map to a CSV file with the columns:
     * word, count, and count divided by the total number of words.
     *
     * @param fileName          path of the CSV file to create (overwritten if it exists)
     * @param hMap              word -> occurrence count
     * @param totalWordsCounter total number of words scanned; denominator of the ratio column
     */
    static void generateCsvFile(String fileName, HashMap<String, Integer> hMap, int totalWordsCounter) {
        // try-with-resources closes the writer even if an append throws,
        // fixing the resource leak in the original explicit-close version.
        try (FileWriter writer = new FileWriter(fileName, false)) {
            writer.append("Word");
            writer.append(',');
            writer.append("Count");
            writer.append(',');
            writer.append("Count/Total Number of words");
            writer.append('\n');
            double total = totalWordsCounter;
            // Typed entry iteration replaces the raw Iterator + toString round-trips.
            for (Map.Entry<String, Integer> entry : hMap.entrySet()) {
                writer.append(entry.getKey());
                writer.append(',');
                writer.append(entry.getValue().toString());
                writer.append(',');
                // If the raw "count/total" form is preferred, use instead:
                // writer.append(entry.getValue() + "/" + totalWordsCounter);
                writer.append(Double.toString(entry.getValue() / total));
                writer.append('\n');
            }
            writer.flush();
        } catch (IOException e) {
            // Preserve the original best-effort behavior: report and continue.
            e.printStackTrace();
        }
    }
}
package com.ourOffice;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.*;
import java.util.concurrent.*;
//This class is used for storing input word pool file name and word list
class WordPool {
    // Terms read from one input file, as set by the caller.
    private ArrayList<String> wordList;
    // Name of the input file these terms came from.
    private String fileName;

    public String getFileName() {
        return this.fileName;
    }

    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    public ArrayList<String> getWordList() {
        return this.wordList;
    }

    public void setWordList(ArrayList<String> wordList) {
        this.wordList = wordList;
    }
}
// This class is used for storing the filename and a HashMap containing the words
// and their frequency of occurrence in the files in the given folder
class DocFrequencyHashMap {
    // Word -> frequency count, as set by the caller.
    private HashMap<String, Integer> hm;
    // Name of the word-pool file this result belongs to.
    private String fileName;

    public String getFileName() {
        return this.fileName;
    }

    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    public HashMap<String, Integer> getHm() {
        return this.hm;
    }

    public void setHm(HashMap<String, Integer> hm) {
        this.hm = hm;
    }
}
public class ProcessDocuments {

    /**
     * One callable per data folder: for a given list of search terms,
     * computes how many files under {@code path} contain each term
     * (document frequency).
     */
    static class ProcessDoc implements Callable<DocFrequencyHashMap> {
        private File path;
        private DocFrequencyHashMap dhm;
        private ArrayList<String> arr;
        private String fileName;

        ProcessDoc(File path, DocFrequencyHashMap dhm, ArrayList<String> arr,
                String fileName) {
            this.path = path;
            this.dhm = dhm;
            this.arr = arr;
            this.fileName = fileName;
        }

        /**
         * Reads every file directly under {@code path}, tokenizes it on spaces
         * (lower-cased), and bumps the count of each search term present in
         * the file. A term is counted at most once per file.
         *
         * @return the result holder populated with the term counts and the
         *         word-pool file name
         */
        @Override
        public DocFrequencyHashMap call() {
            HashMap<String, Integer> hm = new HashMap<String, Integer>();
            File[] files = path.listFiles();
            if (files == null) {
                // Not a readable directory: return an empty result instead of NPE.
                dhm.setHm(hm);
                dhm.setFileName(fileName);
                return dhm;
            }
            // Loop-invariant: the search-term set does not change per file.
            Set<String> termsSet = new HashSet<String>(arr);
            for (File fileEntry : files) {
                if (!fileEntry.isFile()) {
                    continue;
                }
                // Dropped the redundant DataInputStream wrapper of the original.
                try (BufferedReader br = new BufferedReader(new InputStreamReader(
                        new FileInputStream(fileEntry)))) {
                    // Distinct lower-cased words of this document.
                    Set<String> documentTermsSet = new HashSet<String>();
                    String strLine;
                    while ((strLine = br.readLine()) != null) {
                        StringTokenizer st = new StringTokenizer(strLine, " ");
                        while (st.hasMoreTokens()) {
                            documentTermsSet.add(st.nextToken().trim().toLowerCase());
                        }
                    }
                    // Each search term found in this document counts once.
                    Set<String> intersectionSet = new HashSet<String>(termsSet);
                    intersectionSet.retainAll(documentTermsSet);
                    for (String str : intersectionSet) {
                        Integer prev = hm.get(str);
                        hm.put(str, prev == null ? 1 : prev + 1);
                    }
                } catch (IOException e) {
                    // FileNotFoundException is an IOException; one catch suffices.
                    // Best effort: skip unreadable files and keep going.
                    e.printStackTrace();
                }
            }
            dhm.setHm(hm);
            dhm.setFileName(fileName);
            return dhm;
        }
    }

    static ExecutorService executor;

    /**
     * Entry point.
     *
     * @param args args[0] = folder of input term files; args[1] = root folder of
     *             data folders (each holding a "text" subfolder); args[2] =
     *             output folder for the generated CSV files
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 3) {
            System.out
                    .println("Usage::: java <package name>ProcessDocuments <input files folder location> <data files root folder location> <output file to be stored>");
            System.exit(0);
        }
        File inputFolder = new File(args[0]);
        File folder = new File(args[1]);
        File outputFolder = new File(args[2]);
        // One WordPool per input term file, in directory-listing order.
        ArrayList<WordPool> wordPoolArrayList = createTermsArray(inputFolder);
        Collection<ProcessDoc> collection = new ArrayList<ProcessDoc>();
        File[] folders = folder.listFiles();
        if (folders == null || folders.length == 0) {
            System.out.println("No data folders found under " + folder.getAbsolutePath());
            return;
        }
        long startTime = System.currentTimeMillis();
        executor = Executors.newFixedThreadPool(folders.length);
        try {
            int i = 0;
            for (File fileEntry1 : folders) {
                if (!fileEntry1.isDirectory()) {
                    continue;
                }
                File[] children = fileEntry1.listFiles();
                if (children == null) {
                    continue;
                }
                for (File fileEntry : children) {
                    if (fileEntry.isDirectory()
                            && fileEntry.getName().equalsIgnoreCase("text")) {
                        // The i-th "text" folder is paired with the i-th word pool.
                        // NOTE(review): this relies on both trees listing in the
                        // same order — confirm that assumption holds for the data.
                        WordPool pool = wordPoolArrayList.get(i++);
                        collection.add(new ProcessDoc(fileEntry,
                                new DocFrequencyHashMap(), pool.getWordList(),
                                pool.getFileName()));
                    }
                }
            }
            List<Future<DocFrequencyHashMap>> list = executor.invokeAll(collection);
            long endTime = System.currentTimeMillis();
            System.out.println("Total number of threads executed::"
                    + folders.length);
            System.out.println("Timing " + (endTime - startTime) + " milliseconds");
            for (Future<DocFrequencyHashMap> fut : list) {
                DocFrequencyHashMap result = fut.get();
                // Output file name can be changed here.
                String t = outputFolder + File.separator + "output_"
                        + result.getFileName() + ".csv";
                // 1000 = ratio denominator (the document count, per the spec).
                generateCsvFile(t, result.getHm(), 1000);
                System.out.println("Output CSV files are generated in :: " + t);
            }
        } finally {
            // Shut the pool down even if invokeAll()/get() throws.
            executor.shutdown();
        }
    }

    /**
     * Builds one WordPool per file under {@code folder}, recursing into
     * subfolders. Each pool holds the file's base name (extension stripped)
     * and its lines, trimmed and lower-cased.
     *
     * @param folder root directory of input term files
     * @return all word pools found under {@code folder}
     */
    public static ArrayList<WordPool> createTermsArray(File folder) {
        ArrayList<WordPool> wordPoolArrayList = new ArrayList<WordPool>();
        File[] entries = folder.listFiles();
        if (entries == null) {
            return wordPoolArrayList;
        }
        for (File fileEntry : entries) {
            if (fileEntry.isDirectory()) {
                // BUG FIX: the recursive result used to be discarded, so word
                // pools in subdirectories were silently lost. Merge them in.
                wordPoolArrayList.addAll(createTermsArray(fileEntry));
            } else if (fileEntry.isFile()) {
                WordPool wordPool = new WordPool();
                try (BufferedReader br = new BufferedReader(new InputStreamReader(
                        new FileInputStream(fileEntry)))) {
                    ArrayList<String> tokenizedTerms = new ArrayList<String>();
                    String strLine;
                    while ((strLine = br.readLine()) != null) {
                        tokenizedTerms.add(strLine.trim().toLowerCase());
                    }
                    String fileName = fileEntry.getName();
                    // Strip the 4-character extension (".txt") for the pool name.
                    wordPool.setFileName(fileName.trim().substring(0,
                            fileName.length() - 4));
                    wordPool.setWordList(tokenizedTerms);
                    wordPoolArrayList.add(wordPool);
                } catch (Exception e) {
                    // Broad catch kept from the original: also guards the
                    // substring above for names shorter than 4 characters.
                    e.printStackTrace();
                }
            }
        }
        return wordPoolArrayList;
    }

    /**
     * Writes one CSV row per term: term, document frequency, and document
     * frequency divided by {@code totalWordsCounter}.
     *
     * @param fileName          output CSV path (overwritten if it exists)
     * @param hMap              term -> number of documents containing it
     * @param totalWordsCounter denominator of the ratio column
     */
    static void generateCsvFile(String fileName, HashMap<String, Integer> hMap,
            int totalWordsCounter) {
        try (FileWriter writer = new FileWriter(fileName, false)) {
            writer.append("Word");
            writer.append(',');
            writer.append("Doc Frequency");
            writer.append(',');
            writer.append("Doc Frequency/1000");
            writer.append('\n');
            double total = totalWordsCounter;
            for (Map.Entry<String, Integer> entry : hMap.entrySet()) {
                writer.append(entry.getKey());
                writer.append(',');
                writer.append(entry.getValue().toString());
                writer.append(',');
                // If the raw "count/total" form is preferred, use instead:
                // writer.append(entry.getValue() + "/" + totalWordsCounter);
                writer.append(Double.toString(entry.getValue() / total));
                writer.append('\n');
            }
            writer.flush();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
/**
*
*/
package com.ourOffice;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.StringTokenizer;
/**
* @author
*
*/
public class ReadFilesFromFolder {
private static String temp = "";
public static void listFilesForFolder(File folder) {
HashMap<String, Integer> map = null;
for (File fileEntry : folder.listFiles()) {
if (fileEntry.isDirectory()) {
// System.out.println("Reading files under the folder "+folder.getAbsolutePath());
listFilesForFolder(fileEntry);
} else {
//if it is a file
if (fileEntry.isFile()) {
map = new HashMap<>();
temp = fileEntry.getName();
//checks if it is a txt file
String fileName = temp.substring(0, temp.length()-4);
if ((temp.substring(temp.lastIndexOf('.') + 1,
temp.length()).toLowerCase()).equals("txt")) {
System.out.println("File= " + folder.getAbsolutePath()
+ "\\" + fileEntry.getName());
try(FileInputStream fstream = new FileInputStream(
fileEntry);
DataInputStream in = new DataInputStream(fstream);
BufferedReader br = new BufferedReader(
new InputStreamReader(in));) {
//reads a file
String strLine;
int totalNumberOfWords = 0;
while ((strLine = br.readLine()) != null) {
//tokenizes based on spaces
StringTokenizer st = new StringTokenizer(strLine, " ");
while (st.hasMoreTokens()) {
String word = st.nextToken();
//adds the word to hashmap and updates the counter
totalNumberOfWords++;
if (map.containsKey(word.toLowerCase())) {
map.put(word.toLowerCase(), map.get(word.toLowerCase()) + 1);
} else {
map.put(word.toLowerCase(), 1);
}
}
}
//file to be generated in this path.
//System.out.println("ABS::"+folder.getAbsolutePath());
String t = folder.getAbsolutePath()+"\\"+fileName+".csv";
CSVGenerator.generateCsvFile(t,map,totalNumberOfWords);
} catch (FileNotFoundException e)
{
e.printStackTrace();
}
catch (IOException e)
{
e.printStackTrace();
}
}
}
}
}
}
}
The specs for the problem are as follows :
1. There are 11 .txt input files, each containing a list of words.
2. Output will be 11 .csv files with columns: word, document frequency, document frequency/1000.
3. We will need to search for each word in the 1000 text files (documents) and calculate document frequency, i.e. in how many documents the word occurs.
eg - input file may look like
planet
moon
sun
eg - output file may look like
word doc freq doc frequency/1000
================================
planet 10 10/1000
moon 5 5/1000
sun 1 1/1000
/**
*
*/
package com.ourOffice;
import java.io.File;
/**
* @author
*
*/
public class WordCounter {
    // Root folder whose text files will be word-counted.
    public static File folder = new File("C:\\MyTestingRoot");

    /**
     * Entry point: recursively processes every text file under {@link #folder}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out.println("Reading files under the folder "
                + folder.getAbsolutePath());
        ReadFilesFromFolder.listFilesForFolder(folder);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment