/* This was a lab I completed with my friend Morgan for our CS10 course this past quarter at Dartmouth.
 * Brown University compiled a huge document made up of tons of sentences from books, magazines, newspapers, etc.,
 * plus a second document where each entry is the part of speech for the corresponding word in the first
 * document.
 * For this lab, we were given both documents from Brown, and then used the Viterbi algorithm to basically
 * "teach" our program how to take in another document (we were given test documents with solutions for class) or user input
 * and output the corresponding parts of speech.
 * To deduce the part of speech of each word (from the user input or from a document you put in), the computer
 * relied on two things. First was the probability that the word was a certain POS based on what POS the word before it was,
 * and second was the probability that the word was a certain POS based on what the word actually was (like the probability that
 * the word 'trains' is a noun versus a verb). We trained the computer to learn these probabilities by reading in
 * the patterns in the documents from Brown, and then used Viterbi to score all the possible "paths" of POSs, found the
 * most likely one based on our probabilities, and backtraced to print out the best path of parts of speech at the end.
 * I loved this lab because I thought it was so cool that we were able to use probabilities and algorithms to do something as
 * human as identifying the part of speech of a certain word given its context. Upon turning in this lab, we were able to
 * run through the Brown training sentences and calculate the part of speech for a given word with 97% accuracy.
 */
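/* A quick worked example of the scoring step (hypothetical numbers and tags, not measured from our training data):
 * suppose the path so far ends at N with score -1.2, the transition N -> V has log probability -0.7,
 * and the word "trains" has log probability -2.1 under V. Then Viterbi scores
 *     score(V) = -1.2 + (-0.7) + (-2.1) = -4.0
 * and records N as V's predecessor at this position in the backtrace. If "trains" had never been
 * observed under V, a flat unseen-word penalty (100 in the code below) is subtracted instead.
 */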
//Morgan and Annika's PS5
//3.3.17
//POS= Part of Speech
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.*;
public class PS5 {
    //Map that keeps track of current part of speech and a map to the next possible parts of speech and the frequencies of going to each
    static Map<String, Map<String, Double>> POStoNextandFreq = new HashMap<String, Map<String, Double>>();
    //Map that keeps track of the current part of speech and a map to the observed words in this part of speech and their frequencies
    static Map<String, Map<String, Double>> POStoWordandFreq = new HashMap<String, Map<String, Double>>();
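    /* For intuition, after training the maps might look like this (made-up values; "#" is the start-of-sentence state):
     *   POStoNextandFreq: { "#" -> { "DET" -> log(0.6), "N" -> log(0.4) }, "N" -> { "V" -> log(0.7), ... }, ... }
     *   POStoWordandFreq: { "V" -> { "trains" -> log(0.01), "saw" -> log(0.02), ... }, ... }
     * The values start out as raw counts and are converted to log probabilities below.
     */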
    //ArrayList full of the different parts of speech read in
    static ArrayList<String> POS = new ArrayList<String>();
    //ArrayList full of the words read in
    static ArrayList<String> totalWords = new ArrayList<String>();
    //List of the best path per line
    static List<String> bestPath = new ArrayList<String>();
    //ArrayList of ArrayLists. Each individual list is a sentence held as a list of strings, and the master list holds all the sentence lists
    static List<ArrayList<String>> sampleSentence = new ArrayList<ArrayList<String>>();
    //ourAnswers is the corresponding parts of speech that we got at the end... all the best paths put together into a list
    static ArrayList<String> ourAnswers = new ArrayList<String>();
    //rightAnswers is the correct answers given to us in order to test our accuracy
    static ArrayList<String> rightAnswers = new ArrayList<String>();
    public static void main(String[] args){
        //creates POStoWordandFreq
        createMapWords();
        //creates POStoNextandFreq
        createMapNext();
        //OPTION 1: RUN THIS (comment out option 2)
        //Go through each sentence and put it through the Viterbi algorithm
        for(int i = 0; i < sampleSentence.size(); i++) {
            viterbi(sampleSentence.get(i)); //calls viterbi method
        }
        System.out.println(corrections(rightAnswers, ourAnswers)); //prints out the result
        //OPTION 2: RUN THIS (comment out option 1)
        //This is the method that does the personal input instead of reading from a file
        //personalInput();
    }
    //does the personal input sentence tagging
    public static void personalInput(){
        Scanner scan = new Scanner(System.in); //our scanner
        while(scan.hasNextLine()){
            String input = scan.nextLine(); //line the person entered
            if(input.equals("-1")){ //will keep running until you enter -1 (== would compare references, not string contents)
                break;
            }
            else{
                ArrayList<String> sendtoviterbi = new ArrayList<String>(); //array list we will send to viterbi
                String[] allWords = input.split(" "); //splitting it by spaces
                for(String w: allWords){ //put all words into the array list
                    sendtoviterbi.add(w.toLowerCase()); //lower case to match the lowercased training words
                }
                viterbi(sendtoviterbi); //do viterbi
                System.out.println(bestPath); //print the best path
            }
        }
    }
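    /* A hypothetical session with personalInput() (the exact tags depend on the training data):
     *   > the dog saw trains
     *   [DET, N, V, N]
     *   > -1
     */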
    //Method that calculates the total correct and the total wrong tags when testing from our given documents
    public static String corrections(List<String> answerKey, List<String> test){
        int same = 0;
        int diff = 0;
        for(int i = 0; i < answerKey.size(); i++){ //goes through each tag
            if(answerKey.get(i).equals(test.get(i))){ //if they are equal, increment same
                same += 1;
            }
            else{ //if they are different, increment diff
                diff += 1;
            }
        }
        String response = "You got " + same + " correct and " + diff + " incorrect"; //create response to print
        return response;
    }
    //This is the Viterbi algorithm. It takes in a sentence and tags all the words
    public static void viterbi(ArrayList<String> sentence){
        bestPath = new ArrayList<String>(); //we will be storing the best path in this array
        //we keep track of the backtrace here. The integer is the index in the sentence, the first string (key) is the next state, and the second string (value) is the state we came from
        Map<Integer, HashMap<String, String>> backtrace = new HashMap<Integer, HashMap<String,String>>();
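        /* For example (hypothetical tags), after scoring the sentence "the dog barks" the backtrace might hold
         *   { 0: {"DET" -> "#"}, 1: {"N" -> "DET"}, 2: {"V" -> "N"} }
         * so starting from the best final state and repeatedly looking up its predecessor recovers the whole path.
         */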
        //make a set called currStates which only has "#" so far
        Set<String> currStates = new HashSet<String>();
        currStates.add("#");
        //make a map called currScores which only contains "#" so far
        Map<String, Double> currScores = new HashMap<String,Double>();
        currScores.put("#", 0.0);
        //for i from 0 to # observations - 1
        for(int i = 0; i < sentence.size(); i++){
            //starting the backtrace
            backtrace.put(i, new HashMap<String,String>());
            //nextStates = empty set
            Set<String> nextStates = new HashSet<String>();
            //nextScores = empty map
            Map<String,Double> nextScores = new HashMap<String,Double>();
            //for each currState in currStates
            for(String currState: currStates){
                //for each transition currState -> nextState
                for(String nextState: POStoNextandFreq.get(currState).keySet()){
                    //add nextState to nextStates
                    nextStates.add(nextState);
                    //nextScore = currScores[currState] + // path to here
                    //transitionScore(currState -> nextState) + // take a step to there
                    //observationScore(observations[i] in nextState) // make the observation there
                    Double nextScore = currScores.get(currState) + POStoNextandFreq.get(currState).get(nextState);
                    if(POStoWordandFreq.get(nextState).containsKey(sentence.get(i))){
                        nextScore += POStoWordandFreq.get(nextState).get(sentence.get(i));
                    }
                    else{
                        nextScore -= 100; //flat unseen-word penalty: stands in for a very small log probability
                    }
                    //if nextState isn't in nextScores, or nextScore is greater than the current nextScores[nextState]
                    if(!nextScores.containsKey(nextState) || nextScore > nextScores.get(nextState)){
                        //set nextScores[nextState] to nextScore
                        nextScores.put(nextState, nextScore);
                        //remember that the pred of nextState @ i is currState (change the backtrace to the better option)
                        backtrace.get(i).put(nextState, currState);
                    }
                }
            }
            //update the states
            currStates = nextStates;
            currScores = nextScores;
        }
        double highest = Double.NEGATIVE_INFINITY; //lowest possible value as the base case, so any real score beats it
        String highestPOS = "";
        //go through each of the final scores and find the highest score and which part of speech it corresponds to
        for(String s: currScores.keySet()){
            if(currScores.get(s) > highest){
                highest = currScores.get(s);
                highestPOS = s;
            }
        }
        bestPath.add(highestPOS); //add the part of speech to the best path
        //for each word in the sentence, do the backtracking
        for(int k = sentence.size()-1; k > 0; k--){
            bestPath.add(0, backtrace.get(k).get(highestPOS)); //add the predecessor to the front of the list
            highestPOS = backtrace.get(k).get(highestPOS); //step back to the predecessor we just found
        }
        //the best path holds the tags for this sentence, so it becomes part of our answers
        ourAnswers.addAll(bestPath);
    }
    //creates the POStoNextandFreq map
    public static void createMapNext(){
        POStoNextandFreq.put("#", new HashMap<String, Double>());
        //for each part of speech in the list of parts of speech
        for(int i = 0; i < POS.size(); i++){
            String currentPOS = POS.get(i); //get the current part of speech and store it in currentPOS
            if(!POStoNextandFreq.get("#").containsKey(currentPOS)){
                POStoNextandFreq.get("#").put(currentPOS, 1.0);
            }
            //if there is a part of speech after it
            if(i+1 < POS.size()){
                String nextPOS = POS.get(i+1); //set next part of speech to the next part of speech
                //If the map already has the current part of speech
                if(POStoNextandFreq.containsKey(currentPOS)){
                    //and if the current part of speech has the next part of speech in its map already
                    if(POStoNextandFreq.get(currentPOS).containsKey(nextPOS)){
                        //tempfreq is the current frequency
                        Double tempfreq = POStoNextandFreq.get(currentPOS).get(nextPOS);
                        tempfreq++; //update the current frequency by 1
                        //reset the value of that next part of speech compared to the current part of speech
                        POStoNextandFreq.get(currentPOS).put(nextPOS, tempfreq);
                    }
                    else{ //otherwise create a new entry in the current POS's map of next POSs and set the frequency to 1
                        POStoNextandFreq.get(currentPOS).put(nextPOS, 1.0);
                    }
                }
                //if the map does not already have the current part of speech
                else{
                    //put the current part of speech in with a new map
                    POStoNextandFreq.put(currentPOS, new HashMap<String, Double>());
                    //in the new map put the next part of speech in and set its frequency to 1
                    POStoNextandFreq.get(currentPOS).put(nextPOS, 1.0);
                }
            }
        }
//Changing "frequencies" from literal number of times seen, to the logarithms of percent of time seen to keep numbers small
//when going through Viterbi algorithm
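        /* For instance (made-up counts): if N is followed by V 3 times and by DET 1 time, then total = 4 and
         * the stored values become log(3/4) ≈ -0.288 and log(1/4) ≈ -1.386 (natural log, via Math.log). */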
        for(String pos: POStoNextandFreq.keySet()){
            Double total = 0.0; //total counts the total number of times the part of speech was used
            //for each part of speech that comes next from the current part of speech
            for(String w: POStoNextandFreq.get(pos).keySet()){
                //add the frequency of the next part of speech
                total += POStoNextandFreq.get(pos).get(w);
            }
            //for each part of speech that comes next from the current part of speech
            for(String w: POStoNextandFreq.get(pos).keySet()){
                //current frequency is the count of the specific next part of speech divided by the total
                double currfreq = POStoNextandFreq.get(pos).get(w)/total;
                //we then take the log of that
                double newfreq = Math.log(currfreq);
                //put the log back into the map
                POStoNextandFreq.get(pos).put(w, newfreq);
            }
        }
    }
    //This creates the POStoWordandFreq map
    public static void createMapWords(){
        //puts all seen parts of speech in a list
        createPOS();
        //puts all seen words in a list
        createAllWords();
        //for each part of speech in the part of speech list
        for(int i = 0; i < POS.size(); i++){
            //this is the current part of speech
            String currentPOS = POS.get(i);
            //If the map has the current part of speech
            if(POStoWordandFreq.containsKey(currentPOS)){
                if(i < totalWords.size()){ //guard in case the tags file is longer than the words file (was i+1, which skipped the last POS/word combo)
                    //if the part of speech contains the current word already
                    if(POStoWordandFreq.get(currentPOS).containsKey(totalWords.get(i))){
                        //get the current word frequency
                        double curr = POStoWordandFreq.get(currentPOS).get(totalWords.get(i));
                        curr++; //increase the frequency by 1.0
                        POStoWordandFreq.get(currentPOS).put(totalWords.get(i), curr); //update the frequency
                    }
                    //else this POS has never been associated with the corresponding word, so set the frequency to 1
                    else{
                        POStoWordandFreq.get(currentPOS).put(totalWords.get(i), 1.0);
                    }
                }
            }
            //otherwise, if the part of speech isn't already in the map
            else{
                //add the part of speech and create a new "sub" map (since this is a map that has a second map as its value)
                POStoWordandFreq.put(currentPOS, new HashMap<String, Double>());
                //add the current word to that part of speech and set its frequency to 1
                POStoWordandFreq.get(currentPOS).put(totalWords.get(i), 1.0);
            }
        }
        //Changing "frequencies" from literal counts to logs of the fraction of the time seen, for the same reason
        //as in createMapNext: Viterbi adds log probabilities instead of multiplying raw ones
        for(String pos: POStoWordandFreq.keySet()){
            Double total = 0.0; //this is used to calculate the total
            //for each of the words in the part of speech
            for(String w: POStoWordandFreq.get(pos).keySet()){
                total += POStoWordandFreq.get(pos).get(w); //update the total
            }
            //for each of the words in the part of speech
            for(String w: POStoWordandFreq.get(pos).keySet()){
                //current frequency is the count divided by the total
                double currfreq = POStoWordandFreq.get(pos).get(w)/total;
                //new frequency is the log of that, so combined values don't get vanishingly small
                double newfreq = Math.log(currfreq);
                //reset the value with the log
                POStoWordandFreq.get(pos).put(w, newfreq);
            }
        }
    }
    //Fills in our list of words from a file we have been given
    public static void createAllWords(){
        BufferedReader read;
        try{
            read = new BufferedReader(new FileReader("inputs/simple-train-sentences.txt"));
            //word is the next line. A line consists of a whole sentence
            String word = read.readLine();
            while(word != null){ //while there is still a line to read
                //make an array list of the current words
                ArrayList<String> curr = new ArrayList<String>();
                //split the line into words by spaces
                String[] allWords = word.split(" ");
                //for all of the words in the current line
                for(String w: allWords){
                    curr.add(w.toLowerCase()); //make them lower case to avoid errors
                    totalWords.add(w.toLowerCase()); //adding to total words because that will be used in making the word map
                }
                sampleSentence.add(curr); //adding curr to the list of sentence lists so we can traverse a document sentence by sentence instead of as one big block
                word = read.readLine(); //read in the next line
            }
            read.close(); //close the reader so the open file doesn't come back to crash this lab when we only have a few more hours to finish it
        }
        catch(IOException e){
            System.out.print("Can't open file");
        }
    }
    //This method fills a list of all the parts of speech read in from a text document. More detailed comments in the method above; they are pretty similar
    public static void createPOS(){
        BufferedReader read;
        try{ //try/catch in case of file errors
            read = new BufferedReader(new FileReader("inputs/simple-train-tags.txt")); //the tags file given to us (swap in brown-train-tags.txt for the full corpus)
            String word = read.readLine(); //reading in the line
            while(word != null){ //go through each line and add each specific tag, split by spaces
                String[] allWords = word.split(" ");
                for(String w: allWords){
                    POS.add(w);
                    rightAnswers.add(w);
                }
                word = read.readLine();
            }
            read.close();
        }
        //exception
        catch(IOException e){
            System.out.print("Can't open file");
        }
    }
}