Markov models

A Markov model implemented using a map instead of a matrix.

Differs from Colin's description in that it:

  • works with n-grams of any size
  • operates on Objects of any type (see the sketch below)
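
For instance, because the class is generic you can model a sequence of characters just as easily as words. A minimal sketch (the class name CharExample and the "abracadabra" seed are made up for illustration):

import java.util.ArrayList;
import java.util.List;

public class CharExample {
    public static void main(String[] args) {
        // build a character sequence to train on
        List<Character> chars = new ArrayList<Character>();
        for(char c : "abracadabra".toCharArray()) { chars.add(c); }

        // model it with 2-grams and print a 20 character sequence
        Markov<Character> model = new Markov<Character>(2, chars);
        List<Character> start = model.getNGrams().get(0);
        System.out.println(model.sequence(20, start));
    }
}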

The Markov and Tokenizer classes will work in Processing if you copy and paste each class into a new tab in the IDE and delete some of the import statements from the top of the files. (Processing has already imported a lot of stuff.)

The logic used in MarkovianSentences' main method could be ported straight across into Processing's setup()/draw() methods.
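
For example, a direct port might look roughly like this. This is only a sketch, not part of the original gist; it assumes Processing 3 (Java 8), that the classes have been pasted into tabs, and that testFile.txt sits in the sketch's data folder:

import java.util.List;

Markov<String> model;

void setup() {
  // read the training text and build the model once
  String[] lines = loadStrings("testFile.txt");
  List<String> words = Tokenizer.tokenize(String.join(" ", lines));
  model = new Markov<String>(3, words);
  noLoop();
}

void draw() {
  // pick a random n-gram to start from and print a 40 word sequence
  List<List<String>> ngrams = model.getNGrams();
  List<String> start = ngrams.get((int) random(ngrams.size()));
  println(String.join(" ", model.sequence(40, start)));
}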

Twitter and Temboo

If you want to use the Twitter class, the Temboo library must be available on the project's classpath.

In Processing this can be achieved by installing Temboo via the Sketch -> Import Library... -> Add Library... menu option, and then adding it to your sketch by selecting Sketch -> Import Library... -> Temboo.

If you aren't using Processing, the jar you need to add to your Java project can be found here: https://temboo.com/java

Temboo is a friendly wrapper around Facebook, Twitter, weather services, GitHub and all sorts of other web services, and it makes consuming them much easier. I highly recommend getting familiar with it if you want an easy way to pull data off the internet.

import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.util.HashMap;
import java.util.Random;

public class Markov<E> {
    // we'll need a random number generator
    Random random;

    // this map contains our Markov model
    // effectively it's a map from n-grams -> probability tables
    // n-grams have type List<E> - they're lists of E
    // our probability tables have type Map<E, Integer>
    // so overall the model's type is:
    Map<List<E>, Map<E, Integer>> model;
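
    // a worked toy example: for the sequence [a, b, a, b, c] with n = 2 the
    // model ends up as {[a, b] -> {a: 1, c: 1}, [b, a] -> {b: 1}}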

    /**
     * Constructs a new Markov model from sequence with n-gram size of n.
     *
     * @param n the size of the n-grams used by this model
     * @param sequence the source sequence used to generate our model
     */
    public Markov(int n, List<E> sequence) {
        this(new Random(), n, sequence);
    }

    /**
     * Constructs a new Markov model from sequence with n-gram size of n.
     * A random number generator can be supplied, e.g. for reproducing
     * sequences.
     *
     * @param randomGen a random number generator used for navigating the model
     * @param n the size of the n-grams used by this model
     * @param sequence the source sequence used to generate our model
     */
    public Markov(Random randomGen, int n, List<E> sequence) {
        random = randomGen;
        model = new HashMap<List<E>, Map<E, Integer>>();

        // this is where we build the model - we iterate over the sequence
        for(int i = 0; i < sequence.size() - n; i++) {
            // grab an n-gram - n items, copied so the key doesn't change
            // if the source sequence does
            List<E> nGram = new ArrayList<E>(sequence.subList(i, i + n));

            // then look up the probability table for the current n-gram
            Map<E, Integer> probabilities = model.get(nGram);
            if(probabilities == null) {
                probabilities = new HashMap<E, Integer>();
            }

            // grab the item following our n-gram
            E nextItem = sequence.get(i + n);

            // then look up and increment the count of times nextItem has
            // followed the current n-gram so far
            Integer count = probabilities.get(nextItem);
            if(count == null) { count = 0; }
            probabilities.put(nextItem, count + 1);

            // stick the probabilities/counts back in the model
            model.put(nGram, probabilities);
        }
    }

    /**
     * Given an n-gram, return a possible next item.
     *
     * @param nGram an n-gram
     * @return an item from the model
     */
    public E nextItem(List<E> nGram) {
        // really simple - it's just a weighted choose over the probability
        // table we stored for each n-gram
        return weightedChoose(model.get(nGram));
    }

    /**
     * Creates a new n-gram from the passed in n-gram and a possible next item.
     *
     * @param nGram an n-gram
     * @return an n-gram
     */
    public List<E> nextNGram(List<E> nGram) {
        List<E> next = new ArrayList<E>(nGram);
        next.remove(0);            // remove first item
        next.add(nextItem(nGram)); // add new item to end
        return next;
    }
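
    // e.g. with the n = 2 toy model above, nextNGram([a, b]) drops the a and
    // appends a weighted choice from {a: 1, c: 1}, giving [b, a] or [b, c]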

    /**
     * Produces a sequence using the Markov model.
     *
     * @param n the length of the sequence
     * @param nGram the n-gram to start the sequence with
     * @return sequence of objects from the Markov model
     */
    public List<E> sequence(int n, List<E> nGram) {
        // we just call nextNGram(..) over and over, adding each new item
        // to a list
        List<E> seq = new ArrayList<E>(nGram);
        List<E> next = nGram;
        for(int i = nGram.size(); i < n; i++) {
            // stop early if the walk reaches an n-gram the model has no
            // successors for (e.g. the very end of the training sequence)
            if(!model.containsKey(next)) { break; }
            next = nextNGram(next);
            seq.add(next.get(next.size() - 1));
        }
        return seq;
    }
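
    // e.g. with the same toy model, sequence(5, [a, b]) might produce
    // [a, b, a, b, a], or stop early at [a, b, c] once it reaches an
    // n-gram the model has never seen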

    /**
     * Randomly selects an item from a map with the weights specified as
     * values.
     *
     * Google "weighted choose" if you don't know the algorithm! Super useful.
     * (btw. it works just as well with Double etc., it's just Integers in
     * this context...)
     *
     * @param probabilities a map from values to their relative probabilities
     *        e.g. with a map like {"cat": 1, "dog": 1, "monkey": 10}
     *        monkey will be returned 10 times as often as cat or dog
     * @return a value from the probabilities map
     */
    private E weightedChoose(Map<E, Integer> probabilities) {
        // whizz across the weights keeping a running sum as we go
        int runningSum = 0;
        ArrayList<Integer> summedWeights = new ArrayList<Integer>();
        for(int p : probabilities.values()) {
            runningSum += p;
            summedWeights.add(runningSum);
        }

        // choose a number between 0 (inclusive) and the sum (exclusive)
        int threshold = random.nextInt(runningSum);

        // find the index of the first running sum to exceed that number
        int i = 0;
        while(i < summedWeights.size() - 1 &&
              threshold >= summedWeights.get(i)) {
            i++;
        }

        // pull the item out of the map using the index we now have
        // (keySet() and values() iterate in the same order for a given map)
        ArrayList<E> items = new ArrayList<E>(probabilities.keySet());
        return items.get(i);
    }

    /**
     * Helper that returns all of the n-grams used in the model.
     * Useful for finding an n-gram to start a sequence with.
     *
     * @return List of n-grams
     */
    public List<List<E>> getNGrams() {
        return new ArrayList<List<E>>(model.keySet());
    }

    /**
     * We'll just use HashMap's .toString()
     * You'll just have to squint
     */
    public String toString() {
        return model.toString();
    }
}
import java.nio.file.Files;
import java.nio.file.Paths;
import java.io.IOException;
import java.util.List;
import java.util.Random;

public class MarkovianSentences {
    public static void main(String[] args) {
        // get the words we are going to put in our Markov model
        String string = "";
        try {
            byte[] fileBytes = Files.readAllBytes(Paths.get("testFile.txt"));
            string = new String(fileBytes);
            // note you could just as easily pull your tweets in here:
            // string = Twitter.getTweetsForUser("realDonaldTrump");
        } catch(IOException e) {
            System.err.println(e);
            System.exit(1);
        }

        // chop our big string up into separate words, punctuation etc.
        List<String> words = Tokenizer.tokenize(string);

        // this makes a Markov model from trigrams of our words
        // - change 3 to vary the size of the n-grams used
        // - note that the Markov class works with any List of Objects, not
        //   just Lists of Strings
        Markov<String> model = new Markov<String>(3, words);

        // get a list of all the n-grams in our model so we can randomly
        // choose starting points
        List<List<String>> ngrams = model.getNGrams();
        Random r = new Random();

        // print some 40 word sequences, each with their own random start point
        System.out.println("--- Example sentences ---");
        List<String> startNGram = null;
        for(int i = 0; i < 3; i++) {
            startNGram = ngrams.get(r.nextInt(ngrams.size()));
            List<String> sequence = model.sequence(40, startNGram);
            System.out.println(String.join(" ", sequence));
        }
        System.out.println();

        // we can also ask for just one more n-gram at a time
        System.out.println("--- One ngram at a time ---");
        List<String> ngram = model.nextNGram(startNGram);
        ngram = model.nextNGram(ngram);
        System.out.println(ngram);
        ngram = model.nextNGram(ngram);
        System.out.println(ngram);
        ngram = model.nextNGram(ngram);
        System.out.println(ngram);
    }
}
import java.util.List;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Tokenizer {
    public static List<String> tokenize(String s) {
        s = s.toLowerCase();

        // match lower case words or individual punctuation marks as tokens
        Matcher m = Pattern.compile("\\p{Lower}+|\\p{Punct}").matcher(s);
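
        // e.g. "Hello, world!" becomes [hello, ,, world, !]
        // (digits and whitespace are silently dropped)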

        List<String> tokens = new ArrayList<String>();
        while(m.find()) {
            tokens.add(m.group());
        }
        return tokens;
    }
}
import com.temboo.core.TembooSession;
import com.temboo.Library.Twitter.Timelines.UserTimeline;
import com.temboo.Library.Twitter.Timelines.UserTimeline.UserTimelineInputSet;
import com.temboo.Library.Twitter.Timelines.UserTimeline.UserTimelineResultSet;
import org.json.JSONArray;
import java.io.IOException;

public class Twitter {
    public static String getTweetsForUser(String screenName) throws IOException {
        String s = "";
        try {
            // instantiate the Choreo using a TembooSession
            // (substitute your own Temboo account name, app name and app key)
            TembooSession session = new TembooSession("triss", "myFirstApp", "dsLE57s1wXRodpJqTYpUkUfY2S0iEv4i");
            UserTimeline userTimelineChoreo = new UserTimeline(session);

            // tell the choreo whose timeline we want and how many tweets,
            // passing screenName through via Temboo's ScreenName input
            UserTimelineInputSet userTimelineInputs = userTimelineChoreo.newInputSet();
            userTimelineInputs.setCredential("triss");
            userTimelineInputs.set_ScreenName(screenName);
            userTimelineInputs.set_Count("200");

            UserTimelineResultSet userTimelineResults = userTimelineChoreo.execute(userTimelineInputs);

            // the response is a JSON array of tweets - join their text
            // together into one big string
            JSONArray results = new JSONArray(userTimelineResults.get_Response());
            for(int i = 0; i < results.length(); i++) {
                s += results.getJSONObject(i).getString("text") + " ";
            }
        } catch(Exception e) {
            throw new IOException("Failed to get tweets for " + screenName, e);
        }
        return s;
    }
}