Markov models

A Markov model implemented using a map instead of a matrix.

Differs from Colin's description in that it:

  • works with n-grams of any size
  • operates on Objects of any type (see the sketch below)
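
For instance, because the class is generic you can model a sequence of characters just as easily as words. A minimal sketch (the class name CharExample and the "abracadabra" seed are made up for illustration):

import java.util.ArrayList;
import java.util.List;

public class CharExample {
    public static void main(String[] args) {
        // build a character sequence to train on
        List<Character> chars = new ArrayList<Character>();
        for(char c : "abracadabra".toCharArray()) { chars.add(c); }

        // model it with 2-grams and print a 20 character sequence
        Markov<Character> model = new Markov<Character>(2, chars);
        List<Character> start = model.getNGrams().get(0);
        System.out.println(model.sequence(20, start));
    }
}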

The Markov and Tokenizer classes will work in Processing if you copy and paste each class into a new tab in the IDE and delete some of the import statements from the top of the files. (Processing has already imported a lot of stuff.)

The logic used in MarkovianSentences' main method could be ported straight across into Processing's setup()/draw() methods.
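
For example, a direct port might look roughly like this. This is only a sketch, not part of the original gist; it assumes Processing 3 (Java 8), that the classes have been pasted into tabs, and that testFile.txt sits in the sketch's data folder:

import java.util.List;

Markov<String> model;

void setup() {
  // read the training text and build the model once
  String[] lines = loadStrings("testFile.txt");
  List<String> words = Tokenizer.tokenize(String.join(" ", lines));
  model = new Markov<String>(3, words);
  noLoop();
}

void draw() {
  // pick a random n-gram to start from and print a 40 word sequence
  List<List<String>> ngrams = model.getNGrams();
  List<String> start = ngrams.get((int) random(ngrams.size()));
  println(String.join(" ", model.sequence(40, start)));
}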

Twitter and Temboo

If you want to use the Twitter class, the Temboo library must be available on the project's classpath.

In Processing this can be achieved by installing Temboo via the Sketch -> Import Library... -> Add Library... menu option, and then adding it to your sketch by selecting Sketch -> Import Library... -> Temboo.

If you aren't using Processing, the jar you need to add to your Java project can be found here: https://temboo.com/java

Temboo is a friendly wrapper around Facebook, Twitter, weather services, GitHub and all sorts of other web services, and it makes consuming them much easier. I highly recommend getting familiar with it if you want an easy way to pull data off the internet.

import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.util.HashMap;
import java.util.Random;

public class Markov<E> {
    // we'll need a random number generator
    Random random;

    // this map contains our Markov model
    // effectively it's a map from n-grams -> probability tables
    // n-grams have type List<E> - they're lists of E
    // our probability tables have type Map<E, Integer>
    // so overall the model's type is:
    Map<List<E>, Map<E, Integer>> model;
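
    // a worked toy example: for the sequence [a, b, a, b, c] with n = 2 the
    // model ends up as {[a, b] -> {a: 1, c: 1}, [b, a] -> {b: 1}}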

    /**
     * Constructs a new Markov model from sequence with n-gram size of n.
     *
     * @param n the size of the n-grams used by this model
     * @param sequence the source sequence used to generate our model
     */
    public Markov(int n, List<E> sequence) {
        this(new Random(), n, sequence);
    }

    /**
     * Constructs a new Markov model from sequence with n-gram size of n.
     * A random number generator can be supplied, e.g. for reproducing
     * sequences.
     *
     * @param randomGen a random number generator used for navigating the model
     * @param n the size of the n-grams used by this model
     * @param sequence the source sequence used to generate our model
     */
    public Markov(Random randomGen, int n, List<E> sequence) {
        random = randomGen;
        model = new HashMap<List<E>, Map<E, Integer>>();

        // this is where we build the model - we iterate over the sequence
        for(int i = 0; i < sequence.size() - n; i++) {
            // grab an n-gram - n items, copied so the key doesn't change
            // if the source sequence does
            List<E> nGram = new ArrayList<E>(sequence.subList(i, i + n));

            // then look up the probability table for the current n-gram
            Map<E, Integer> probabilities = model.get(nGram);
            if(probabilities == null) {
                probabilities = new HashMap<E, Integer>();
            }

            // grab the item following our n-gram
            E nextItem = sequence.get(i + n);

            // then look up and increment the count of times nextItem has
            // followed the current n-gram so far
            Integer count = probabilities.get(nextItem);
            if(count == null) { count = 0; }
            probabilities.put(nextItem, count + 1);

            // stick the probabilities/counts back in the model
            model.put(nGram, probabilities);
        }
    }

    /**
     * Given an n-gram, return a possible next item.
     *
     * @param nGram an n-gram
     * @return an item from the model
     */
    public E nextItem(List<E> nGram) {
        // really simple - it's just a weighted choose over the probability
        // table we stored for each n-gram
        return weightedChoose(model.get(nGram));
    }

    /**
     * Creates a new n-gram from the passed in n-gram and a possible next item.
     *
     * @param nGram an n-gram
     * @return an n-gram
     */
    public List<E> nextNGram(List<E> nGram) {
        List<E> next = new ArrayList<E>(nGram);
        next.remove(0);            // remove first item
        next.add(nextItem(nGram)); // add new item to end
        return next;
    }
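
    // e.g. with the n = 2 toy model above, nextNGram([a, b]) drops the a and
    // appends a weighted choice from {a: 1, c: 1}, giving [b, a] or [b, c]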

    /**
     * Produces a sequence using the Markov model.
     *
     * @param n the length of the sequence
     * @param nGram the n-gram to start the sequence with
     * @return sequence of objects from the Markov model
     */
    public List<E> sequence(int n, List<E> nGram) {
        // we just call nextNGram(..) over and over, adding each new item
        // to a list
        List<E> seq = new ArrayList<E>(nGram);
        List<E> next = nGram;
        for(int i = nGram.size(); i < n; i++) {
            // stop early if the walk reaches an n-gram the model has no
            // successors for (e.g. the very end of the training sequence)
            if(!model.containsKey(next)) { break; }
            next = nextNGram(next);
            seq.add(next.get(next.size() - 1));
        }
        return seq;
    }
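
    // e.g. with the same toy model, sequence(5, [a, b]) might produce
    // [a, b, a, b, a], or stop early at [a, b, c] once it reaches an
    // n-gram the model has never seen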

    /**
     * Randomly selects an item from a map with the weights specified as
     * values.
     *
     * Google "weighted choose" if you don't know the algorithm! Super useful.
     * (btw. it works just as well with Double etc., it's just Integers in
     * this context...)
     *
     * @param probabilities a map from values to their relative probabilities
     *        e.g. with a map like {"cat": 1, "dog": 1, "monkey": 10}
     *        monkey will be returned 10 times as often as cat or dog
     * @return a value from the probabilities map
     */
    private E weightedChoose(Map<E, Integer> probabilities) {
        // whizz across the weights keeping a running sum as we go
        int runningSum = 0;
        ArrayList<Integer> summedWeights = new ArrayList<Integer>();
        for(int p : probabilities.values()) {
            runningSum += p;
            summedWeights.add(runningSum);
        }

        // choose a number between 0 (inclusive) and the sum (exclusive)
        int threshold = random.nextInt(runningSum);

        // find the index of the first running sum to exceed that number
        int i = 0;
        while(i < summedWeights.size() - 1 &&
              threshold >= summedWeights.get(i)) {
            i++;
        }

        // pull the item out of the map using the index we now have
        // (keySet() and values() iterate in the same order for a given map)
        ArrayList<E> items = new ArrayList<E>(probabilities.keySet());
        return items.get(i);
    }

    /**
     * Helper that returns all of the n-grams used in the model.
     * Useful for finding an n-gram to start a sequence with.
     *
     * @return List of n-grams
     */
    public List<List<E>> getNGrams() {
        return new ArrayList<List<E>>(model.keySet());
    }

    /**
     * We'll just use HashMap's .toString()
     * You'll just have to squint
     */
    public String toString() {
        return model.toString();
    }
}
import java.nio.file.Files;
import java.nio.file.Paths;
import java.io.IOException;
import java.util.List;
import java.util.Random;

public class MarkovianSentences {
    public static void main(String[] args) {
        // get the words we are going to put in our Markov model
        String string = "";
        try {
            byte[] fileBytes = Files.readAllBytes(Paths.get("testFile.txt"));
            string = new String(fileBytes);
            // note you could just as easily pull your tweets in here:
            // string = Twitter.getTweetsForUser("realDonaldTrump");
        } catch(IOException e) {
            System.err.println(e);
            System.exit(1);
        }

        // chop our big string up into separate words, punctuation etc.
        List<String> words = Tokenizer.tokenize(string);

        // this makes a Markov model from trigrams of our words
        // - change 3 to vary the size of the n-grams used
        // - note that the Markov class works with any List of Objects, not
        //   just Lists of Strings
        Markov<String> model = new Markov<String>(3, words);

        // get a list of all the n-grams in our model so we can randomly
        // choose starting points
        List<List<String>> ngrams = model.getNGrams();
        Random r = new Random();

        // print some 40 word sequences, each with their own random start point
        System.out.println("--- Example sentences ---");
        List<String> startNGram = null;
        for(int i = 0; i < 3; i++) {
            startNGram = ngrams.get(r.nextInt(ngrams.size()));
            List<String> sequence = model.sequence(40, startNGram);
            System.out.println(String.join(" ", sequence));
        }
        System.out.println();

        // we can also ask for just one more n-gram at a time
        System.out.println("--- One ngram at a time ---");
        List<String> ngram = model.nextNGram(startNGram);
        ngram = model.nextNGram(ngram);
        System.out.println(ngram);
        ngram = model.nextNGram(ngram);
        System.out.println(ngram);
        ngram = model.nextNGram(ngram);
        System.out.println(ngram);
    }
}
import java.util.List;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Tokenizer {
    public static List<String> tokenize(String s) {
        s = s.toLowerCase();

        // match lower case words or individual punctuation marks as tokens
        Matcher m = Pattern.compile("\\p{Lower}+|\\p{Punct}").matcher(s);
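
        // e.g. "Hello, world!" becomes [hello, ,, world, !]
        // (digits and whitespace are silently dropped)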

        List<String> tokens = new ArrayList<String>();
        while(m.find()) {
            tokens.add(m.group());
        }
        return tokens;
    }
}
import com.temboo.core.TembooSession;
import com.temboo.Library.Twitter.Timelines.UserTimeline;
import com.temboo.Library.Twitter.Timelines.UserTimeline.UserTimelineInputSet;
import com.temboo.Library.Twitter.Timelines.UserTimeline.UserTimelineResultSet;
import org.json.JSONArray;
import java.io.IOException;

public class Twitter {
    public static String getTweetsForUser(String screenName) throws IOException {
        String s = "";
        try {
            // instantiate the Choreo using a TembooSession
            // (substitute your own Temboo account name, app name and app key)
            TembooSession session = new TembooSession("triss", "myFirstApp", "dsLE57s1wXRodpJqTYpUkUfY2S0iEv4i");
            UserTimeline userTimelineChoreo = new UserTimeline(session);

            // tell the choreo whose timeline we want and how many tweets,
            // passing screenName through via Temboo's ScreenName input
            UserTimelineInputSet userTimelineInputs = userTimelineChoreo.newInputSet();
            userTimelineInputs.setCredential("triss");
            userTimelineInputs.set_ScreenName(screenName);
            userTimelineInputs.set_Count("200");

            UserTimelineResultSet userTimelineResults = userTimelineChoreo.execute(userTimelineInputs);

            // the response is a JSON array of tweets - join their text
            // together into one big string
            JSONArray results = new JSONArray(userTimelineResults.get_Response());
            for(int i = 0; i < results.length(); i++) {
                s += results.getJSONObject(i).getString("text") + " ";
            }
        } catch(Exception e) {
            throw new IOException("Failed to get tweets for " + screenName, e);
        }
        return s;
    }
}