Skip to content

Instantly share code, notes, and snippets.

@arosien
Created July 8, 2009 17:15
Show Gist options
  • Save arosien/142978 to your computer and use it in GitHub Desktop.
package asr.nestedsets;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
/**
 * Finds the largest "nested set" of words in a wordlist: the largest chain of words
 * in which each word is a prefix of the next (e.g. {a, at, ate}).
 *
 * Run with -Xmx512m for the standard wordlist.txt, i.e.,
 * "java -Xmx512m asr.nestedsets.LargestNestedSet wordlist.txt".
 */
public class LargestNestedSet
{
    // Hold the words in a trie (prefix tree), using O(unique prefixes) space.
    private final TrieNode root = new TrieNode(null);

    // wordDepth of the TrieNode(s) with the greatest wordDepth, potentially updated on each insert().
    // Starts at -1 so the very first word (depth 0) is always recorded.
    private int largestWordDepth = -1;

    // There may be more than one largest nested set, potentially updated on each insert().
    private final Set<SortedSet<String>> largestNestedSets = new HashSet<SortedSet<String>>();

    /**
     * Read a wordlist (one word per line) and print the largest nested set(s).
     *
     * @param args args[0] is the filename to read to compute the largest nested set, other args ignored
     * @throws IOException if the wordlist cannot be opened or read
     */
    public static void main(String[] args) throws IOException
    {
        LargestNestedSet lns = new LargestNestedSet();
        int count = 0;
        long prevUsedMemory = 0;
        // FIX: close the reader even if insert() throws; the original leaked it.
        // NOTE(review): FileReader uses the platform default charset — assumed fine
        // for ASCII wordlists; confirm if non-ASCII wordlists are expected.
        BufferedReader reader = new BufferedReader(new FileReader(args[0]));
        try {
            System.err.println(String.format("One '.' per 1000 words processed, used memory is %s", usedMemory()));
            // Insert words, showing progress as we go. Incremental memory use should
            // decrease as more trie nodes are shared.
            String word;
            while ((word = reader.readLine()) != null) {
                lns.insert(word);
                if (count++ % 1000 == 0) {
                    System.err.print(".");
                }
                if (count % 100000 == 0) {
                    long usedMemory = usedMemory();
                    System.err.println(String.format(" %dmb (+%dmb)", usedMemory, usedMemory - prevUsedMemory));
                    prevUsedMemory = usedMemory;
                }
            }
        } finally {
            reader.close();
        }
        long usedMemory = usedMemory();
        System.err.println(String.format("%dmb (+%dmb)", usedMemory, usedMemory - prevUsedMemory));
        System.err.println(String.format("%d words processed", count));
        System.out.println(String.format("Largest nested set(s) are: %s", lns.getLargestNestedSets()));
    }

    /** @return JVM heap currently in use, in whole megabytes. */
    private static long usedMemory()
    {
        Runtime runtime = Runtime.getRuntime();
        long mb = 1024 * 1024;
        return (runtime.totalMemory() - runtime.freeMemory()) / mb;
    }

    /**
     * Insert a word, updating the largest nested set(s) as a side-effect.
     * O(word length) time, plus retro-update of any sub-words already in the trie.
     * Empty words and duplicate words are no-ops.
     *
     * @param word a lowercased word consisting only of letters 'a'..'z'
     * @throws IllegalArgumentException if word contains a character outside 'a'..'z'
     */
    public void insert(String word)
    {
        // Assume word is normalized, i.e., lowercased, etc.
        if (word.isEmpty()) {
            return;
        }
        // Iterative addition of word, letter by letter, to trie, O(word length) time.
        TrieNode node = root;
        int wordDepth = 0;
        for (int i = 0; i < word.length(); i++) {
            node = node.getChild(word.charAt(i));
            if (node.isWord()) {
                wordDepth++; // Keep track of how many words are above the word we're inserting.
            }
        }
        // FIX: a duplicate word would count its own final node in wordDepth (off by one)
        // and re-run incrementWordDepth(), double-counting every sub-word and inflating
        // largestWordDepth. Treat re-insertion of an existing word as a no-op.
        if (node.isWord()) {
            return;
        }
        // Mark the final node as a word in the trie.
        node.setWord(word, wordDepth);
    }

    /** @return the current largest nested set(s); live view, not a copy. */
    public Set<SortedSet<String>> getLargestNestedSets()
    {
        return largestNestedSets;
    }

    /**
     * One node of the trie. Inner (non-static) class: nodes update the enclosing
     * instance's largestWordDepth / largestNestedSets as words are marked.
     */
    private class TrieNode
    {
        private TrieNode[] children = new TrieNode[26]; // One for each English letter; could possibly compress since most children will be null.
        private TrieNode parent = null;
        private String word = null; // The word this node represents, null if an intermediate non-word node.
        private int wordDepth = 0; // If a word, how many words (exclusive) are above this in the trie.

        public TrieNode(TrieNode parent)
        {
            this.parent = parent;
        }

        /**
         * Get (lazily creating) the child node for the given letter.
         *
         * @throws IllegalArgumentException if letter is outside 'a'..'z'
         */
        public TrieNode getChild(char letter)
        {
            int offset = letter - 'a'; // 'a' is at 0, etc.
            // FIX: fail with a clear message instead of a bare
            // ArrayIndexOutOfBoundsException on apostrophes, uppercase, digits, etc.
            if (offset < 0 || offset >= children.length) {
                throw new IllegalArgumentException(String.format("Expected letter in 'a'..'z', got '%c'", letter));
            }
            TrieNode child = children[offset];
            // Lazily create child.
            if (child == null) {
                child = new TrieNode(this);
                children[offset] = child;
            }
            return child;
        }

        /** Mark this node as a word and propagate bookkeeping. */
        public void setWord(String word, int wordDepth)
        {
            this.word = word;
            this.wordDepth = wordDepth;
            // This node may be the deepest wordDepth-wise and thus have the largest nested set.
            updateLargestNestedSets();
            // We may be adding a word that is already a prefix of others, so adjust
            // wordDepth (and largest nested sets) for sub-nodes.
            incrementWordDepth();
        }

        /** @return this node's non-null children, in letter order. */
        private Collection<TrieNode> nonNullChildren()
        {
            ArrayList<TrieNode> c = new ArrayList<TrieNode>();
            for (int i = 0; i < children.length; i++) {
                if (children[i] != null) {
                    c.add(children[i]);
                }
            }
            return c;
        }

        /*
         * Bump wordDepth for every word node below this one (breadth-first, iterative).
         * O(number of sub-nodes) ~= O(number of words that have the current word as a prefix).
         */
        private void incrementWordDepth()
        {
            Collection<TrieNode> nodes = nonNullChildren();
            while (!nodes.isEmpty()) {
                ArrayList<TrieNode> newNodes = new ArrayList<TrieNode>();
                for (TrieNode node : nodes) {
                    if (node.isWord()) {
                        node.wordDepth++;
                        node.updateLargestNestedSets(); // This word may now be a largest nested set.
                    }
                    newNodes.addAll(node.nonNullChildren());
                }
                nodes = newNodes;
            }
        }

        /*
         * Potentially set or add this word's nested set as the largest. A strictly
         * deeper word replaces the current winners; an equally deep word joins them.
         */
        private void updateLargestNestedSets()
        {
            if (wordDepth >= largestWordDepth) {
                if (wordDepth > largestWordDepth) {
                    largestNestedSets.clear();
                    largestWordDepth = wordDepth;
                }
                largestNestedSets.add(getNestedSet());
            }
        }

        /*
         * Collect all word nodes on the path from this node up to the root.
         * O(depth of node) ~= O(wordDepth).
         */
        public SortedSet<String> getNestedSet()
        {
            SortedSet<String> set = new TreeSet<String>();
            TrieNode node = this;
            while (node.parent != null) {
                if (node.isWord()) {
                    set.add(node.getWord());
                }
                node = node.parent;
            }
            return set;
        }

        public String getWord()
        {
            return word;
        }

        public boolean isWord()
        {
            return word != null;
        }

        @Override
        public String toString()
        {
            StringBuilder c = new StringBuilder();
            c.append('[');
            for (int i = 0; i < children.length; i++) {
                if (children[i] != null) {
                    c.append((char) (i + 'a'));
                    c.append("=");
                    c.append(children[i]);
                }
            }
            c.append(']');
            return String.format("[%schildren=%s]", word == null ? "" : String.format("word=%s,wordDepth=%d,", word, wordDepth), c);
        }
    }

    @Override
    public String toString()
    {
        return root.toString();
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment