anastasop/IncreasingBlabla.java

## IncreasingBlabla.java
/*

An Amazing sentence in English from
https://plus.google.com/photos/117176908342196183611/albums/5812811361700087857/5812811362489240738

> I do not know where family doctors acquired illegibly
> perplexing handwriting; nevertheless, extraordinary
> pharmaceutical intellectuality counterbalancing
> indecipherability transcendentalizes intercommunication's
> incomprehensibleness

The first word is one letter long, the second word is two letters,
the third word three letters long and it goes on like this
until the twentieth word

This is a Java program that tries to generate such sentences.
It reads a large text file and builds a graph, where the nodes
are words and an edge goes from one word to another iff i) the second
word has exactly one letter more and ii) there is a place in the text
file where the second word directly follows the first word.

After constructing the graph it traverses it to generate sentences.
This version generates all possible sentences, which is a large amount
of data. Maybe i should add a bit randomness in it to make it more
practical.

I tried it with the Odyssey and the Bible from the project gutenberg.
The best i could get was 9/10-word sentences, readable but pretty meaningless

Odyssey: I to and till night sprang towards Penelope therefore
Bible: O ye his seed shall thrust another brethren therefore understand

Enjoy
Spyros http://twitter.com/anastasop
*/

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;


class Word {
	String word;
	Set<Word> followers = new HashSet<Word>();

	Word(String s) {
		word = s;
	}

	void addFollower(Word w) {
		followers.add(w);
	}

	int length() {
		return word.length();
	}

	@Override
	public int hashCode() {
		return word.hashCode();
	}

	@Override
	public boolean equals(Object obj) {
		return word.equals(obj);
	}
}

/**
 * Generates "increasing" sentences from a text file: sentences whose first
 * word has one letter and in which every following word is exactly one
 * letter longer than the word before it.
 *
 * <p>The text is turned into a graph whose nodes are the distinct words and
 * whose edges connect a word to a word that directly follows it somewhere in
 * the text and is one letter longer. Every path from a one-letter word to a
 * word without followers is then printed, provided it has at least the
 * requested number of words.
 */
public class IncreasingBlabla {
	/**
	 * Command line entry point.
	 *
	 * @param args {@code args[0]} is the minimum sentence length in words,
	 *             {@code args[1]} is the path of a UTF-8 text file
	 */
	public static void main(String[] args) {
		if (args.length != 2) {
			System.err.println("usage: blabla <min sentence len> <file>");
			System.exit(2);
		}

		int minLength = 0;
		try {
			minLength = Integer.parseInt(args[0]);
		} catch (NumberFormatException e) {
			// A non-numeric length is a usage error, not a reason for a stack trace.
			System.err.println("usage: blabla <min sentence len> <file>");
			System.exit(2);
		}

		String text = "";
		try {
			text = readFile(new File(args[1]));
		} catch (Exception e) {
			// Top-level boundary: report any failure to load the file and stop.
			System.err.println("error: " + e.getMessage());
			System.exit(2);
		}

		Map<String, Word> words = buildGraph(text);
		for (Word startWord: words.values()) {
			if (startWord.length() == 1) {
				traverseGraph(startWord, minLength, new ArrayList<Word>());
			}
		}
	}

	/**
	 * Reads the whole file and decodes it as UTF-8.
	 *
	 * <p>A single {@code read} call may return fewer bytes than requested,
	 * so the buffer is filled in a loop, and the stream is closed even when
	 * reading fails.
	 *
	 * @param f the file to read
	 * @return the decoded content of the file
	 * @throws IOException if the file cannot be read or does not fit in a byte array
	 */
	private static String readFile(File f) throws IOException {
		long size = f.length();
		if (size > Integer.MAX_VALUE) {
			throw new IOException("file too large: " + f);
		}
		byte[] content = new byte[(int) size];
		InputStream ist = new FileInputStream(f);
		try {
			int filled = 0;
			while (filled < content.length) {
				int n = ist.read(content, filled, content.length - filled);
				if (n < 0) {
					break; // file ended earlier than its reported length
				}
				filled += n;
			}
			return new String(content, 0, filled, "UTF-8");
		} finally {
			ist.close();
		}
	}

	/**
	 * Builds the word graph for the given text.
	 *
	 * <p>Tokens are separated by whitespace and punctuation. Empty tokens
	 * (produced by consecutive separators such as CRLF line ends, double
	 * spaces or leading punctuation) are skipped so that they do not break
	 * the link between two words that are adjacent in the text.
	 *
	 * @param text the text to analyse
	 * @return the distinct words of the text, keyed by their spelling
	 */
	private static Map<String, Word> buildGraph(String text) {
		Map<String, Word> words = new HashMap<String, Word>();
		Word prevWord = null;
		for (String textToken: text.split("\\s+")) {
			for (String wordToken: textToken.split("\\p{Punct}+")) {
				if (wordToken.length() == 0) {
					continue;
				}
				Word currWord = words.get(wordToken);
				if (currWord == null) {
					currWord = new Word(wordToken);
					words.put(wordToken, currWord);
				}
				// Edge only when the current word is exactly one letter longer.
				if (prevWord != null && currWord.length() - prevWord.length() == 1) {
					prevWord.addFollower(currWord);
				}
				prevWord = currWord;
			}
		}
		return words;
	}

	/**
	 * Depth-first walk over the followers of {@code w}. Whenever a word
	 * without followers is reached, the path that led to it is printed on
	 * one line (each word followed by a space) if it contains at least
	 * {@code minLength} words.
	 *
	 * @param w            the word to visit
	 * @param minLength    minimum number of words a printed sentence must have
	 * @param currSentence the path walked so far; {@code w} is appended for
	 *                     the duration of the call and removed before returning
	 */
	static void traverseGraph(Word w, int minLength, List<Word> currSentence) {
		currSentence.add(w);
		if (w.followers.isEmpty()) {
			if (currSentence.size() >= minLength) {
				StringBuilder line = new StringBuilder();
				for (Word cw: currSentence) {
					line.append(cw.word).append(" ");
				}
				System.out.println(line);
			}
		} else {
			for (Word nextWord: w.followers) {
				traverseGraph(nextWord, minLength, currSentence);
			}
		}
		currSentence.remove(currSentence.size() - 1);
	}
}
	/*

	An Amazing sentence in English from
	https://plus.google.com/photos/117176908342196183611/albums/5812811361700087857/5812811362489240738

	> I do not know where family doctors acquired illegibly
	> perplexing handwriting; nevertheless, extraordinary
	> pharmaceutical intellectuality counterbalancing
	> indecipherability transcendentalizes intercommunication's
	> incomprehensibleness

	The first word is one letter long, the second word is two letters,
	the third word three letters long and it goes on like this
	until the twentieth word

	This is a Java program that tries to generate such sentences.
	It reads a large text file and builds a graph, where the nodes
	are words and an edge goes from one word to another iff i) the second
	word has exactly one letter more and ii) there is a place in the text
	file where the second word directly follows the first word.

	After constructing the graph it traverses it to generate sentences.
	This version generates all possible sentences, which is a large amount
	of data. Maybe i should add a bit randomness in it to make it more
	practical.

	I tried it with the Odyssey and the Bible from the project gutenberg.
	The best i could get was 9/10-word sentences, readable but pretty meaningless

	Odyssey: I to and till night sprang towards Penelope therefore
	Bible: O ye his seed shall thrust another brethren therefore understand

	Enjoy
	Spyros http://twitter.com/anastasop
	*/

	import java.io.File;
	import java.io.FileInputStream;
	import java.io.FileNotFoundException;
	import java.io.IOException;
	import java.io.InputStream;
	import java.io.UnsupportedEncodingException;
	import java.util.ArrayList;
	import java.util.HashMap;
	import java.util.HashSet;
	import java.util.List;
	import java.util.Map;
	import java.util.Set;


	class Word {
	String word;
	Set<Word> followers = new HashSet<Word>();

	Word(String s) {
	word = s;
	}

	void addFollower(Word w) {
	followers.add(w);
	}

	int length() {
	return word.length();
	}

	@Override
	public int hashCode() {
	return word.hashCode();
	}

	@Override
	public boolean equals(Object obj) {
	return word.equals(obj);
	}
	}

	/**
	 * Generates "increasing" sentences from a text file: sentences whose first
	 * word has one letter and in which every following word is exactly one
	 * letter longer than the word before it.
	 *
	 * <p>The text is turned into a graph whose nodes are the distinct words and
	 * whose edges connect a word to a word that directly follows it somewhere in
	 * the text and is one letter longer. Every path from a one-letter word to a
	 * word without followers is then printed, provided it has at least the
	 * requested number of words.
	 */
	public class IncreasingBlabla {
		/**
		 * Command line entry point.
		 *
		 * @param args {@code args[0]} is the minimum sentence length in words,
		 *             {@code args[1]} is the path of a UTF-8 text file
		 */
		public static void main(String[] args) {
			if (args.length != 2) {
				System.err.println("usage: blabla <min sentence len> <file>");
				System.exit(2);
			}

			int minLength = 0;
			try {
				minLength = Integer.parseInt(args[0]);
			} catch (NumberFormatException e) {
				// A non-numeric length is a usage error, not a reason for a stack trace.
				System.err.println("usage: blabla <min sentence len> <file>");
				System.exit(2);
			}

			String text = "";
			try {
				text = readFile(new File(args[1]));
			} catch (Exception e) {
				// Top-level boundary: report any failure to load the file and stop.
				System.err.println("error: " + e.getMessage());
				System.exit(2);
			}

			Map<String, Word> words = buildGraph(text);
			for (Word startWord: words.values()) {
				if (startWord.length() == 1) {
					traverseGraph(startWord, minLength, new ArrayList<Word>());
				}
			}
		}

		/**
		 * Reads the whole file and decodes it as UTF-8.
		 *
		 * <p>A single {@code read} call may return fewer bytes than requested,
		 * so the buffer is filled in a loop, and the stream is closed even when
		 * reading fails.
		 *
		 * @param f the file to read
		 * @return the decoded content of the file
		 * @throws IOException if the file cannot be read or does not fit in a byte array
		 */
		private static String readFile(File f) throws IOException {
			long size = f.length();
			if (size > Integer.MAX_VALUE) {
				throw new IOException("file too large: " + f);
			}
			byte[] content = new byte[(int) size];
			InputStream ist = new FileInputStream(f);
			try {
				int filled = 0;
				while (filled < content.length) {
					int n = ist.read(content, filled, content.length - filled);
					if (n < 0) {
						break; // file ended earlier than its reported length
					}
					filled += n;
				}
				return new String(content, 0, filled, "UTF-8");
			} finally {
				ist.close();
			}
		}

		/**
		 * Builds the word graph for the given text.
		 *
		 * <p>Tokens are separated by whitespace and punctuation. Empty tokens
		 * (produced by consecutive separators such as CRLF line ends, double
		 * spaces or leading punctuation) are skipped so that they do not break
		 * the link between two words that are adjacent in the text.
		 *
		 * @param text the text to analyse
		 * @return the distinct words of the text, keyed by their spelling
		 */
		private static Map<String, Word> buildGraph(String text) {
			Map<String, Word> words = new HashMap<String, Word>();
			Word prevWord = null;
			for (String textToken: text.split("\\s+")) {
				for (String wordToken: textToken.split("\\p{Punct}+")) {
					if (wordToken.length() == 0) {
						continue;
					}
					Word currWord = words.get(wordToken);
					if (currWord == null) {
						currWord = new Word(wordToken);
						words.put(wordToken, currWord);
					}
					// Edge only when the current word is exactly one letter longer.
					if (prevWord != null && currWord.length() - prevWord.length() == 1) {
						prevWord.addFollower(currWord);
					}
					prevWord = currWord;
				}
			}
			return words;
		}

		/**
		 * Depth-first walk over the followers of {@code w}. Whenever a word
		 * without followers is reached, the path that led to it is printed on
		 * one line (each word followed by a space) if it contains at least
		 * {@code minLength} words.
		 *
		 * @param w            the word to visit
		 * @param minLength    minimum number of words a printed sentence must have
		 * @param currSentence the path walked so far; {@code w} is appended for
		 *                     the duration of the call and removed before returning
		 */
		static void traverseGraph(Word w, int minLength, List<Word> currSentence) {
			currSentence.add(w);
			if (w.followers.isEmpty()) {
				if (currSentence.size() >= minLength) {
					StringBuilder line = new StringBuilder();
					for (Word cw: currSentence) {
						line.append(cw.word).append(" ");
					}
					System.out.println(line);
				}
			} else {
				for (Word nextWord: w.followers) {
					traverseGraph(nextWord, minLength, currSentence);
				}
			}
			currSentence.remove(currSentence.size() - 1);
		}
	}