Skip to content

Instantly share code, notes, and snippets.

@arosien
Created July 8, 2009 17:15
Show Gist options
  • Save arosien/142978 to your computer and use it in GitHub Desktop.
package asr.nestedsets;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
/**
 * Finds the largest "nested set" of words in a wordlist: the largest chain of words
 * in which each word is a prefix of the next (e.g. {a, at, ate}).
 *
 * Run with -Xmx512m for the standard wordlist.txt, i.e.,
 * "java -Xmx512m asr.nestedsets.LargestNestedSet wordlist.txt".
 */
public class LargestNestedSet
{
    // Hold the words in a trie (prefix tree), using O(unique prefixes) space.
    private final TrieNode root = new TrieNode(null);

    // wordDepth of the TrieNode(s) with the greatest wordDepth, potentially updated on each insert().
    // Starts at -1 so the very first word (depth 0) is always recorded.
    private int largestWordDepth = -1;

    // There may be more than one largest nested set, potentially updated on each insert().
    private final Set<SortedSet<String>> largestNestedSets = new HashSet<SortedSet<String>>();

    /**
     * Read a wordlist (one word per line) and print the largest nested set(s).
     *
     * @param args args[0] is the filename to read to compute the largest nested set, other args ignored
     * @throws IOException if the wordlist cannot be opened or read
     */
    public static void main(String[] args) throws IOException
    {
        LargestNestedSet lns = new LargestNestedSet();
        int count = 0;
        long prevUsedMemory = 0;
        // FIX: close the reader even if insert() throws; the original leaked it.
        // NOTE(review): FileReader uses the platform default charset — assumed fine
        // for ASCII wordlists; confirm if non-ASCII wordlists are expected.
        BufferedReader reader = new BufferedReader(new FileReader(args[0]));
        try {
            System.err.println(String.format("One '.' per 1000 words processed, used memory is %s", usedMemory()));
            // Insert words, showing progress as we go. Incremental memory use should
            // decrease as more trie nodes are shared.
            String word;
            while ((word = reader.readLine()) != null) {
                lns.insert(word);
                if (count++ % 1000 == 0) {
                    System.err.print(".");
                }
                if (count % 100000 == 0) {
                    long usedMemory = usedMemory();
                    System.err.println(String.format(" %dmb (+%dmb)", usedMemory, usedMemory - prevUsedMemory));
                    prevUsedMemory = usedMemory;
                }
            }
        } finally {
            reader.close();
        }
        long usedMemory = usedMemory();
        System.err.println(String.format("%dmb (+%dmb)", usedMemory, usedMemory - prevUsedMemory));
        System.err.println(String.format("%d words processed", count));
        System.out.println(String.format("Largest nested set(s) are: %s", lns.getLargestNestedSets()));
    }

    /** @return JVM heap currently in use, in whole megabytes. */
    private static long usedMemory()
    {
        Runtime runtime = Runtime.getRuntime();
        long mb = 1024 * 1024;
        return (runtime.totalMemory() - runtime.freeMemory()) / mb;
    }

    /**
     * Insert a word, updating the largest nested set(s) as a side-effect.
     * O(word length) time, plus retro-update of any sub-words already in the trie.
     * Empty words and duplicate words are no-ops.
     *
     * @param word a lowercased word consisting only of letters 'a'..'z'
     * @throws IllegalArgumentException if word contains a character outside 'a'..'z'
     */
    public void insert(String word)
    {
        // Assume word is normalized, i.e., lowercased, etc.
        if (word.isEmpty()) {
            return;
        }
        // Iterative addition of word, letter by letter, to trie, O(word length) time.
        TrieNode node = root;
        int wordDepth = 0;
        for (int i = 0; i < word.length(); i++) {
            node = node.getChild(word.charAt(i));
            if (node.isWord()) {
                wordDepth++; // Keep track of how many words are above the word we're inserting.
            }
        }
        // FIX: a duplicate word would count its own final node in wordDepth (off by one)
        // and re-run incrementWordDepth(), double-counting every sub-word and inflating
        // largestWordDepth. Treat re-insertion of an existing word as a no-op.
        if (node.isWord()) {
            return;
        }
        // Mark the final node as a word in the trie.
        node.setWord(word, wordDepth);
    }

    /** @return the current largest nested set(s); live view, not a copy. */
    public Set<SortedSet<String>> getLargestNestedSets()
    {
        return largestNestedSets;
    }

    /**
     * One node of the trie. Inner (non-static) class: nodes update the enclosing
     * instance's largestWordDepth / largestNestedSets as words are marked.
     */
    private class TrieNode
    {
        private TrieNode[] children = new TrieNode[26]; // One for each English letter; could possibly compress since most children will be null.
        private TrieNode parent = null;
        private String word = null; // The word this node represents, null if an intermediate non-word node.
        private int wordDepth = 0; // If a word, how many words (exclusive) are above this in the trie.

        public TrieNode(TrieNode parent)
        {
            this.parent = parent;
        }

        /**
         * Get (lazily creating) the child node for the given letter.
         *
         * @throws IllegalArgumentException if letter is outside 'a'..'z'
         */
        public TrieNode getChild(char letter)
        {
            int offset = letter - 'a'; // 'a' is at 0, etc.
            // FIX: fail with a clear message instead of a bare
            // ArrayIndexOutOfBoundsException on apostrophes, uppercase, digits, etc.
            if (offset < 0 || offset >= children.length) {
                throw new IllegalArgumentException(String.format("Expected letter in 'a'..'z', got '%c'", letter));
            }
            TrieNode child = children[offset];
            // Lazily create child.
            if (child == null) {
                child = new TrieNode(this);
                children[offset] = child;
            }
            return child;
        }

        /** Mark this node as a word and propagate bookkeeping. */
        public void setWord(String word, int wordDepth)
        {
            this.word = word;
            this.wordDepth = wordDepth;
            // This node may be the deepest wordDepth-wise and thus have the largest nested set.
            updateLargestNestedSets();
            // We may be adding a word that is already a prefix of others, so adjust
            // wordDepth (and largest nested sets) for sub-nodes.
            incrementWordDepth();
        }

        /** @return this node's non-null children, in letter order. */
        private Collection<TrieNode> nonNullChildren()
        {
            ArrayList<TrieNode> c = new ArrayList<TrieNode>();
            for (int i = 0; i < children.length; i++) {
                if (children[i] != null) {
                    c.add(children[i]);
                }
            }
            return c;
        }

        /*
         * Bump wordDepth for every word node below this one (breadth-first, iterative).
         * O(number of sub-nodes) ~= O(number of words that have the current word as a prefix).
         */
        private void incrementWordDepth()
        {
            Collection<TrieNode> nodes = nonNullChildren();
            while (!nodes.isEmpty()) {
                ArrayList<TrieNode> newNodes = new ArrayList<TrieNode>();
                for (TrieNode node : nodes) {
                    if (node.isWord()) {
                        node.wordDepth++;
                        node.updateLargestNestedSets(); // This word may now be a largest nested set.
                    }
                    newNodes.addAll(node.nonNullChildren());
                }
                nodes = newNodes;
            }
        }

        /*
         * Potentially set or add this word's nested set as the largest. A strictly
         * deeper word replaces the current winners; an equally deep word joins them.
         */
        private void updateLargestNestedSets()
        {
            if (wordDepth >= largestWordDepth) {
                if (wordDepth > largestWordDepth) {
                    largestNestedSets.clear();
                    largestWordDepth = wordDepth;
                }
                largestNestedSets.add(getNestedSet());
            }
        }

        /*
         * Collect all word nodes on the path from this node up to the root.
         * O(depth of node) ~= O(wordDepth).
         */
        public SortedSet<String> getNestedSet()
        {
            SortedSet<String> set = new TreeSet<String>();
            TrieNode node = this;
            while (node.parent != null) {
                if (node.isWord()) {
                    set.add(node.getWord());
                }
                node = node.parent;
            }
            return set;
        }

        public String getWord()
        {
            return word;
        }

        public boolean isWord()
        {
            return word != null;
        }

        @Override
        public String toString()
        {
            StringBuilder c = new StringBuilder();
            c.append('[');
            for (int i = 0; i < children.length; i++) {
                if (children[i] != null) {
                    c.append((char) (i + 'a'));
                    c.append("=");
                    c.append(children[i]);
                }
            }
            c.append(']');
            return String.format("[%schildren=%s]", word == null ? "" : String.format("word=%s,wordDepth=%d,", word, wordDepth), c);
        }
    }

    @Override
    public String toString()
    {
        return root.toString();
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment