Skip to content

Instantly share code, notes, and snippets.

@darkcrawler01
Created April 22, 2014 20:03
Show Gist options
  • Save darkcrawler01/11192422 to your computer and use it in GitHub Desktop.
Save darkcrawler01/11192422 to your computer and use it in GitHub Desktop.
Code I wrote for a programming challenge; not sure why I didn't clear it, though.
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
/**
 * A DFS state used while decomposing a word into dictionary sub-words:
 * records where the yet-unmatched suffix starts and how many sub-words
 * were matched before that point.
 */
class WordNode {
    // Start index (inclusive) of the unmatched suffix within the word.
    int index = -1;
    // Number of dictionary sub-words matched in [0, index).
    int subWords = 0;

    public WordNode(int ind, int subWords) {
        this.index = ind;
        this.subWords = subWords;
    }

    /*
     * (non-Javadoc)
     * @see java.lang.Object#equals(java.lang.Object)
     * Overriding primarily so List#contains can detect already-visited states.
     */
    @Override
    public boolean equals(Object other) {
        return other instanceof WordNode
                && this.index == ((WordNode) other).index
                && this.subWords == ((WordNode) other).subWords;
    }

    /*
     * Overriding equals without hashCode violates the Object contract:
     * equal nodes must produce equal hashes or any hash-based collection
     * (HashSet/HashMap) would silently misbehave.
     */
    @Override
    public int hashCode() {
        return 31 * index + subWords;
    }
}
/**
 * @author darkcrawler01
 * @since 04/21/2014
 * Write a program that reads a file containing a sorted list of words (one word per line, no spaces, all lower case), then identifies:
 * 1. the 1st-longest word in the file that can be constructed by concatenating copies of shorter words also found in the file;
 * 2. the 2nd-longest such word;
 * 3. the total count of how many of the words in the list can be constructed from other words in the list.
 *
 * If pruning is allowed, i.e. we don't care about the total count of compound words (part 3),
 * I was able to achieve a computation time of 0.12 secs on an i5 MBP laptop, but removed it later.
 */
public class LongestCompoundWord {

    /**
     * Reads the input file (one word per line), finds every word that can be
     * built by concatenating shorter words from the same list, and prints:
     * the word(s) with the most sub-words, the word(s) with the second-most
     * sub-words, and the total number of compound decompositions found.
     *
     * Time complexity: O(n * m^2) where n = number of words, m = max length.
     *
     * @param fileName path to the file containing one lower-case word per line
     * @throws IOException if the file cannot be found or read
     */
    public static void findLongestCompoundWord(String fileName) throws IOException {
        // primitive long: boxing the timer value (Long) served no purpose
        long startTime = System.nanoTime();
        // The problem statement mentions ~173k words, so pre-size the set to
        // 174k to avoid rehashing. (The previous value 1,740,000 was an
        // order-of-magnitude typo relative to its own comment.)
        HashSet<String> lookupWords = new HashSet<>(174000);
        String line;
        int maxLength = 0, minLength = Integer.MAX_VALUE;
        // try-with-resources closes the reader; any IOException (including
        // FileNotFoundException, its subtype) simply propagates to the caller.
        // Calling printStackTrace() before rethrowing would report it twice.
        try (BufferedReader br = new BufferedReader(new FileReader(fileName))) {
            while ((line = br.readLine()) != null) {
                // assumption: each line holds a single word with no extra spaces
                if (line.length() == 0) {
                    continue;
                }
                if (line.length() < minLength) {
                    minLength = line.length();
                }
                if (line.length() > maxLength) {
                    maxLength = line.length();
                }
                lookupWords.add(line);
            }
        }
        double endTime = System.nanoTime() - startTime;
        endTime /= Math.pow(10, 9);
        System.out.println("Total No.of words = " + lookupWords.size() + ", min word Length=" + minLength + ", max word Length = " + maxLength);
        System.out.println("File readtime:" + endTime + " (secs)");

        // compoundedWords holds the final solution, bucketed by sub-word count
        // so the results never need sorting: bucket i collects words splittable
        // into exactly i sub-words. Known trade-offs of this design:
        //   1. the same word may land in two different buckets (counted twice);
        //   2. the same word may appear in one bucket more than once, since an
        //      ArrayList is used — a List<HashSet> would give unique entries.
        // A word of length maxLength has at most maxLength/minLength sub-words,
        // so that (plus bucket 0) bounds the number of buckets needed.
        List<ArrayList<String>> compoundedWords = new ArrayList<>(maxLength / minLength + 1);
        for (int i = 0; i < maxLength / minLength + 1; i++) {
            compoundedWords.add(new ArrayList<String>(25));
        }

        startTime = System.nanoTime();
        int countCompoundWords = 0;
        int maxSubWords = 0, prevMaxSubWords = 0;
        // Time Complexity: O(n(m^2))
        //   n - number of words, m - maxLength
        for (String currWord : lookupWords) {
            // Approach: iterative DFS over split positions.
            //   1. Split the string into prefixes (from index 0) found in the list.
            //   2. Repeat step 1 from the end index of each match until the end
            //      of the string is reached.
            List<WordNode> subWordsIndexList = new ArrayList<>(10);
            // Each WordNode tracks the current suffix start index and the number
            // of sub-words matched so far.
            subWordsIndexList.add(new WordNode(0, 0));
            int listIndex = 0;
            while (listIndex < subWordsIndexList.size()) {
                // Nodes are never removed (unlike traditional DFS): the list
                // itself doubles as the visited set via contains() below, which
                // benchmarked faster here than a separate HashSet.
                WordNode currWordNode = subWordsIndexList.get(listIndex++);
                for (int k = currWordNode.index + 1; k <= currWord.length(); k++) {
                    if (!lookupWords.contains(currWord.substring(currWordNode.index, k))) {
                        continue;
                    }
                    // termination condition: matched through the end of the word
                    if (k == currWord.length() && currWordNode.subWords > 0) {
                        // note: this counts every distinct way of splitting the
                        // word, not distinct compound words
                        countCompoundWords++;
                        int totalSubWords = currWordNode.subWords + 1;
                        // Only the two largest sub-word counts are reported, so
                        // skip collecting words below the current runner-up.
                        // (The previous update condition compared with
                        // `< maxSubWords`, which could never hold while
                        // maxSubWords was 0, so the pruning never engaged; the
                        // printed top-two buckets and total are unchanged.)
                        if (totalSubWords >= prevMaxSubWords) {
                            if (totalSubWords > maxSubWords) {
                                prevMaxSubWords = maxSubWords;
                                maxSubWords = totalSubWords;
                            } else if (totalSubWords > prevMaxSubWords && totalSubWords < maxSubWords) {
                                prevMaxSubWords = totalSubWords;
                            }
                            // roughly sorted: bucketed by number of sub-words
                            compoundedWords.get(totalSubWords).add(currWord);
                        }
                    } else if (k < currWord.length()) {
                        WordNode newNode = new WordNode(k, currWordNode.subWords + 1);
                        if (!subWordsIndexList.contains(newNode)) {
                            subWordsIndexList.add(newNode);
                        }
                    }
                }
            }
        }
        endTime = System.nanoTime() - startTime;
        endTime /= Math.pow(10, 9);
        System.out.println("Computation time:" + endTime + " (secs)");

        // Scan buckets from the largest sub-word count down: the first two
        // non-empty buckets hold the longest and second-longest results.
        int count = 0;
        for (int i = compoundedWords.size() - 1; i > 0; i--) {
            if (compoundedWords.get(i).size() > 0) {
                if (count == 0) {
                    System.out.println("Longest word(s) with " + i + " subwords: " + compoundedWords.get(i));
                    count++;
                } else {
                    System.out.println("Second Longest word(s) with " + i + " subwords: " + compoundedWords.get(i));
                    break;
                }
            }
        }
        System.out.println("Total No. of Compound Words in the list: " + countCompoundWords);
        System.out.println("Note: These results contain all the ways of attaining compound words and are not distinct");
    }

    public static void main(String[] args) {
        try {
            LongestCompoundWord.findLongestCompoundWord("wordsforproblem.txt");
        } catch (IOException e) {
            // no caller to propagate to from main; report and exit normally
            e.printStackTrace();
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment