Skip to content

Instantly share code, notes, and snippets.

@darkcrawler01
Created April 22, 2014 20:03
Show Gist options
  • Save darkcrawler01/11192422 to your computer and use it in GitHub Desktop.
Save darkcrawler01/11192422 to your computer and use it in GitHub Desktop.
Code I wrote for a programming challenge; not sure why I didn't clear it, though.
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
/**
 * A DFS state used while decomposing a word into dictionary sub-words:
 * records where the yet-unmatched suffix starts and how many sub-words
 * were matched before that point.
 */
class WordNode {
    // Start index (inclusive) of the unmatched suffix within the word.
    int index = -1;
    // Number of dictionary sub-words matched in [0, index).
    int subWords = 0;

    public WordNode(int ind, int subWords) {
        this.index = ind;
        this.subWords = subWords;
    }

    /*
     * (non-Javadoc)
     * @see java.lang.Object#equals(java.lang.Object)
     * Overriding primarily so List#contains can detect already-visited states.
     */
    @Override
    public boolean equals(Object other) {
        return other instanceof WordNode
                && this.index == ((WordNode) other).index
                && this.subWords == ((WordNode) other).subWords;
    }

    /*
     * Overriding equals without hashCode violates the Object contract:
     * equal nodes must produce equal hashes or any hash-based collection
     * (HashSet/HashMap) would silently misbehave.
     */
    @Override
    public int hashCode() {
        return 31 * index + subWords;
    }
}
/**
 * @author darkcrawler01
 * @since 04/21/2014
 * Write a program that reads a file containing a sorted list of words (one word per line, no spaces, all lower case), then identifies:
 * 1. the 1st-longest word in the file that can be constructed by concatenating copies of shorter words also found in the file;
 * 2. the 2nd-longest such word;
 * 3. the total count of how many of the words in the list can be constructed from other words in the list.
 *
 * If pruning is allowed, i.e. we don't care about the total count of compound words (part 3),
 * I was able to achieve a computation time of 0.12 secs on an i5 MBP laptop, but removed it later.
 */
public class LongestCompoundWord {

    /**
     * Reads the input file (one word per line), finds every word that can be
     * built by concatenating shorter words from the same list, and prints:
     * the word(s) with the most sub-words, the word(s) with the second-most
     * sub-words, and the total number of compound decompositions found.
     *
     * Time complexity: O(n * m^2) where n = number of words, m = max length.
     *
     * @param fileName path to the file containing one lower-case word per line
     * @throws IOException if the file cannot be found or read
     */
    public static void findLongestCompoundWord(String fileName) throws IOException {
        // primitive long: boxing the timer value (Long) served no purpose
        long startTime = System.nanoTime();
        // The problem statement mentions ~173k words, so pre-size the set to
        // 174k to avoid rehashing. (The previous value 1,740,000 was an
        // order-of-magnitude typo relative to its own comment.)
        HashSet<String> lookupWords = new HashSet<>(174000);
        String line;
        int maxLength = 0, minLength = Integer.MAX_VALUE;
        // try-with-resources closes the reader; any IOException (including
        // FileNotFoundException, its subtype) simply propagates to the caller.
        // Calling printStackTrace() before rethrowing would report it twice.
        try (BufferedReader br = new BufferedReader(new FileReader(fileName))) {
            while ((line = br.readLine()) != null) {
                // assumption: each line holds a single word with no extra spaces
                if (line.length() == 0) {
                    continue;
                }
                if (line.length() < minLength) {
                    minLength = line.length();
                }
                if (line.length() > maxLength) {
                    maxLength = line.length();
                }
                lookupWords.add(line);
            }
        }
        double endTime = System.nanoTime() - startTime;
        endTime /= Math.pow(10, 9);
        System.out.println("Total No.of words = " + lookupWords.size() + ", min word Length=" + minLength + ", max word Length = " + maxLength);
        System.out.println("File readtime:" + endTime + " (secs)");

        // compoundedWords holds the final solution, bucketed by sub-word count
        // so the results never need sorting: bucket i collects words splittable
        // into exactly i sub-words. Known trade-offs of this design:
        //   1. the same word may land in two different buckets (counted twice);
        //   2. the same word may appear in one bucket more than once, since an
        //      ArrayList is used — a List<HashSet> would give unique entries.
        // A word of length maxLength has at most maxLength/minLength sub-words,
        // so that (plus bucket 0) bounds the number of buckets needed.
        List<ArrayList<String>> compoundedWords = new ArrayList<>(maxLength / minLength + 1);
        for (int i = 0; i < maxLength / minLength + 1; i++) {
            compoundedWords.add(new ArrayList<String>(25));
        }

        startTime = System.nanoTime();
        int countCompoundWords = 0;
        int maxSubWords = 0, prevMaxSubWords = 0;
        // Time Complexity: O(n(m^2))
        //   n - number of words, m - maxLength
        for (String currWord : lookupWords) {
            // Approach: iterative DFS over split positions.
            //   1. Split the string into prefixes (from index 0) found in the list.
            //   2. Repeat step 1 from the end index of each match until the end
            //      of the string is reached.
            List<WordNode> subWordsIndexList = new ArrayList<>(10);
            // Each WordNode tracks the current suffix start index and the number
            // of sub-words matched so far.
            subWordsIndexList.add(new WordNode(0, 0));
            int listIndex = 0;
            while (listIndex < subWordsIndexList.size()) {
                // Nodes are never removed (unlike traditional DFS): the list
                // itself doubles as the visited set via contains() below, which
                // benchmarked faster here than a separate HashSet.
                WordNode currWordNode = subWordsIndexList.get(listIndex++);
                for (int k = currWordNode.index + 1; k <= currWord.length(); k++) {
                    if (!lookupWords.contains(currWord.substring(currWordNode.index, k))) {
                        continue;
                    }
                    // termination condition: matched through the end of the word
                    if (k == currWord.length() && currWordNode.subWords > 0) {
                        // note: this counts every distinct way of splitting the
                        // word, not distinct compound words
                        countCompoundWords++;
                        int totalSubWords = currWordNode.subWords + 1;
                        // Only the two largest sub-word counts are reported, so
                        // skip collecting words below the current runner-up.
                        // (The previous update condition compared with
                        // `< maxSubWords`, which could never hold while
                        // maxSubWords was 0, so the pruning never engaged; the
                        // printed top-two buckets and total are unchanged.)
                        if (totalSubWords >= prevMaxSubWords) {
                            if (totalSubWords > maxSubWords) {
                                prevMaxSubWords = maxSubWords;
                                maxSubWords = totalSubWords;
                            } else if (totalSubWords > prevMaxSubWords && totalSubWords < maxSubWords) {
                                prevMaxSubWords = totalSubWords;
                            }
                            // roughly sorted: bucketed by number of sub-words
                            compoundedWords.get(totalSubWords).add(currWord);
                        }
                    } else if (k < currWord.length()) {
                        WordNode newNode = new WordNode(k, currWordNode.subWords + 1);
                        if (!subWordsIndexList.contains(newNode)) {
                            subWordsIndexList.add(newNode);
                        }
                    }
                }
            }
        }
        endTime = System.nanoTime() - startTime;
        endTime /= Math.pow(10, 9);
        System.out.println("Computation time:" + endTime + " (secs)");

        // Scan buckets from the largest sub-word count down: the first two
        // non-empty buckets hold the longest and second-longest results.
        int count = 0;
        for (int i = compoundedWords.size() - 1; i > 0; i--) {
            if (compoundedWords.get(i).size() > 0) {
                if (count == 0) {
                    System.out.println("Longest word(s) with " + i + " subwords: " + compoundedWords.get(i));
                    count++;
                } else {
                    System.out.println("Second Longest word(s) with " + i + " subwords: " + compoundedWords.get(i));
                    break;
                }
            }
        }
        System.out.println("Total No. of Compound Words in the list: " + countCompoundWords);
        System.out.println("Note: These results contain all the ways of attaining compound words and are not distinct");
    }

    public static void main(String[] args) {
        try {
            LongestCompoundWord.findLongestCompoundWord("wordsforproblem.txt");
        } catch (IOException e) {
            // no caller to propagate to from main; report and exit normally
            e.printStackTrace();
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment