Created
November 6, 2017 04:27
-
-
Save annikakouhia/1cf29c0e1aa6ad2e315b7e8297b7c339 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* This was a lab I completed with my friend Morgan for our CS10 course this past quarter at Dartmouth.
 * Brown University mapped out a huge document made up of tons of sentences from books, magazines, newspapers, etc.,
 * and then made a second document where each entry was the part of speech for the corresponding entry in the first
 * document.
 * For this lab, we were given the first and second document from Brown, and then used the Viterbi algorithm to basically
 * "teach" our program how to take in another document (we were given test documents with solutions for class) or user input
 * and output the corresponding parts of speech.
 * To deduce what each word (from the user input or from a document you put in) was in terms of parts of speech, the computer
 * relied on two things. First was the probability that the word was a certain POS based on what POS the word before it was,
 * and second was the probability that the word was a certain POS based on what the word actually was (like the probability that
 * the word 'trains' is a noun versus a verb). We trained the computer to know these probabilities by reading in and working with
 * the patterns in the documents from Brown, and then used Viterbi to calculate all the possible "paths" of POSs, before finding the
 * most likely one based on our probabilities, and backtracking to print out the correct path of parts of speech at the end.
 * I loved this lab because I thought it was so cool that we were able to use probabilities and algorithms to do something as
 * human as identifying what the part of speech is for a certain word given its context. Upon turning in this lab, we were able to
 * run through the Brown training sentences and calculate the part of speech for a given word with 97% accuracy.
 */
//Morgan and Annika's PS5
//3.3.17
//POS = Part of Speech
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.math.*;
import java.util.*;
/**
 * A hidden-Markov-model part-of-speech (POS) tagger. Transition and emission
 * probabilities are counted from a pair of parallel training files (sentences
 * and tags), converted to log-probabilities, and then the Viterbi algorithm
 * recovers the most likely tag sequence for each input sentence.
 */
public class PS5 {
    //Log-space score used when a word has never been observed with a given tag
    //(a large negative number, i.e. a very small probability).
    static final double UNSEEN_PENALTY = -100.0;

    //Map from current POS to a map of each possible next POS and the log-probability of that transition.
    //"#" is the artificial sentence-start state.
    static Map<String, Map<String, Double>> POStoNextandFreq = new HashMap<String, Map<String, Double>>();
    //Map from POS to a map of each word observed with that POS and its log-probability.
    static Map<String, Map<String, Double>> POStoWordandFreq = new HashMap<String, Map<String, Double>>();
    //Every tag read from the training-tags file, in order.
    static ArrayList<String> POS = new ArrayList<String>();
    //Every word read from the training-sentences file, in order (lower-cased).
    static ArrayList<String> totalWords = new ArrayList<String>();
    //Best tag sequence found for the most recently tagged sentence.
    static List<String> bestPath = new ArrayList<String>();
    //Each training sentence as its own list of words, so we can tag sentence by sentence.
    static List<ArrayList<String>> sampleSentence = new ArrayList<ArrayList<String>>();
    //Concatenation of every bestPath we produced (our predicted tags).
    static ArrayList<String> ourAnswers = new ArrayList<String>();
    //The gold-standard tags from the training-tags file, used to measure accuracy.
    static ArrayList<String> rightAnswers = new ArrayList<String>();

    public static void main(String[] args) {
        //Build the emission model (this also reads both training files).
        createMapWords();
        //Build the transition model.
        createMapNext();

        //OPTION 1: RUN THIS (comment out option 2)
        //Tag every training sentence and report accuracy against the gold tags.
        for (int i = 0; i < sampleSentence.size(); i++) {
            viterbi(sampleSentence.get(i));
        }
        System.out.println(corrections(rightAnswers, ourAnswers));

        //OPTION 2: RUN THIS (comment out option 1)
        //Interactive tagging of sentences typed by the user.
        //personalInput();
    }

    /**
     * Reads sentences from standard input and prints the tag sequence for each.
     * Typing "-1" on a line by itself quits.
     */
    public static void personalInput() {
        Scanner scan = new Scanner(System.in);
        while (scan.hasNextLine()) {
            String input = scan.nextLine();
            //BUG FIX: the original used input == "-1", which compares object
            //identity and never matches text read from a Scanner; equals()
            //compares the characters, so the sentinel actually works.
            if (input.equals("-1")) {
                break;
            }
            //Split the line into words and hand them to Viterbi.
            ArrayList<String> sentence = new ArrayList<String>(Arrays.asList(input.split(" ")));
            viterbi(sentence);
            System.out.println(bestPath);
        }
    }

    /**
     * Scores our predicted tags against the answer key.
     *
     * @param answerKey the gold-standard tags
     * @param test      our predicted tags, parallel to answerKey
     * @return a human-readable count of correct and incorrect tags
     */
    public static String corrections(List<String> answerKey, List<String> test) {
        int same = 0;
        int diff = 0;
        for (int i = 0; i < answerKey.size(); i++) {
            if (answerKey.get(i).equals(test.get(i))) {
                same++;
            } else {
                diff++;
            }
        }
        return "You got " + same + " correct and " + diff + " incorrect";
    }

    /**
     * Runs the Viterbi algorithm on one sentence: stores the most likely tag
     * sequence in {@code bestPath} and appends it to {@code ourAnswers}.
     * All scores are log-probabilities, so path scores are added, not multiplied.
     *
     * @param sentence the words of one sentence, in order
     */
    public static void viterbi(ArrayList<String> sentence) {
        bestPath = new ArrayList<String>();
        //backtrace.get(i).get(tag) = the predecessor tag at position i-1 on the
        //best path that ends in 'tag' at position i.
        Map<Integer, HashMap<String, String>> backtrace = new HashMap<Integer, HashMap<String, String>>();
        //Start in the artificial sentence-start state "#" with score 0 (log 1).
        Set<String> currStates = new HashSet<String>();
        currStates.add("#");
        Map<String, Double> currScores = new HashMap<String, Double>();
        currScores.put("#", 0.0);

        //Advance one observation (word) at a time.
        for (int i = 0; i < sentence.size(); i++) {
            backtrace.put(i, new HashMap<String, String>());
            Set<String> nextStates = new HashSet<String>();
            Map<String, Double> nextScores = new HashMap<String, Double>();
            for (String currState : currStates) {
                //Try every transition currState -> nextState seen in training.
                for (String nextState : POStoNextandFreq.get(currState).keySet()) {
                    nextStates.add(nextState);
                    //score = path so far + transition score + observation score
                    double nextScore = currScores.get(currState)
                            + POStoNextandFreq.get(currState).get(nextState);
                    Map<String, Double> wordScores = POStoWordandFreq.get(nextState);
                    if (wordScores.containsKey(sentence.get(i))) {
                        nextScore += wordScores.get(sentence.get(i));
                    } else {
                        //Word never seen with this tag: apply a flat penalty.
                        nextScore += UNSEEN_PENALTY;
                    }
                    //Keep only the best-scoring way to reach nextState at i.
                    if (!nextScores.containsKey(nextState) || nextScore > nextScores.get(nextState)) {
                        nextScores.put(nextState, nextScore);
                        backtrace.get(i).put(nextState, currState);
                    }
                }
            }
            currStates = nextStates;
            currScores = nextScores;
        }

        //Find the best-scoring final state.
        double highest = Double.NEGATIVE_INFINITY;
        String highestPOS = "";
        for (String s : currScores.keySet()) {
            if (currScores.get(s) > highest) {
                highest = currScores.get(s);
                highestPOS = s;
            }
        }
        bestPath.add(highestPOS);
        //Walk the backtrace from the end of the sentence to recover the path.
        for (int k = sentence.size() - 1; k > 0; k--) {
            highestPOS = backtrace.get(k).get(highestPOS);
            bestPath.add(0, highestPOS);
        }
        ourAnswers.addAll(bestPath);
    }

    /**
     * Builds {@code POStoNextandFreq} from the tag sequence in {@code POS}:
     * first counts tag-to-tag transitions, then converts every count to a
     * log-probability so Viterbi can sum scores without numeric underflow.
     */
    public static void createMapNext() {
        POStoNextandFreq.put("#", new HashMap<String, Double>());
        for (int i = 0; i < POS.size(); i++) {
            String currentPOS = POS.get(i);
            //NOTE(review): as in the original, every tag ever seen is recorded
            //as reachable from "#" with count 1 (a uniform start distribution),
            //not its true sentence-initial frequency.
            if (!POStoNextandFreq.get("#").containsKey(currentPOS)) {
                POStoNextandFreq.get("#").put(currentPOS, 1.0);
            }
            //Count the transition to the following tag, if there is one.
            if (i + 1 < POS.size()) {
                String nextPOS = POS.get(i + 1);
                Map<String, Double> nexts = POStoNextandFreq.get(currentPOS);
                if (nexts == null) {
                    nexts = new HashMap<String, Double>();
                    POStoNextandFreq.put(currentPOS, nexts);
                }
                Double count = nexts.get(nextPOS);
                nexts.put(nextPOS, count == null ? 1.0 : count + 1.0);
            }
        }
        //Convert raw counts to log(count / total) per source tag.
        for (String pos : POStoNextandFreq.keySet()) {
            Map<String, Double> nexts = POStoNextandFreq.get(pos);
            double total = 0.0;
            for (double count : nexts.values()) {
                total += count;
            }
            for (String w : nexts.keySet()) {
                nexts.put(w, Math.log(nexts.get(w) / total));
            }
        }
    }

    /**
     * Builds {@code POStoWordandFreq} by pairing each tag {@code POS.get(i)}
     * with the word {@code totalWords.get(i)}, then converts counts to
     * log-probabilities.
     */
    public static void createMapWords() {
        //Fill POS (and rightAnswers) from the tags file.
        createPOS();
        //Fill totalWords (and sampleSentence) from the sentences file.
        createAllWords();
        for (int i = 0; i < POS.size(); i++) {
            //Guard against mismatched file lengths. BUG FIX: the original
            //tested i+1 < totalWords.size() while indexing totalWords.get(i),
            //which silently dropped the final word/tag pair of the training
            //data (and its else-branch indexed with no bounds check at all).
            if (i < totalWords.size()) {
                String currentPOS = POS.get(i);
                String word = totalWords.get(i);
                Map<String, Double> words = POStoWordandFreq.get(currentPOS);
                if (words == null) {
                    words = new HashMap<String, Double>();
                    POStoWordandFreq.put(currentPOS, words);
                }
                Double count = words.get(word);
                words.put(word, count == null ? 1.0 : count + 1.0);
            }
        }
        //Convert raw counts to log(count / total) per tag.
        for (String pos : POStoWordandFreq.keySet()) {
            Map<String, Double> words = POStoWordandFreq.get(pos);
            double total = 0.0;
            for (double count : words.values()) {
                total += count;
            }
            for (String w : words.keySet()) {
                words.put(w, Math.log(words.get(w) / total));
            }
        }
    }

    /**
     * Reads the training-sentences file, filling {@code totalWords} with every
     * word (lower-cased so lookups are case-insensitive) and
     * {@code sampleSentence} with one word-list per line.
     */
    public static void createAllWords() {
        //try-with-resources closes the reader even if readLine throws.
        try (BufferedReader read = new BufferedReader(
                new FileReader("inputs/simple-train-sentences.txt"))) {
            String line = read.readLine(); //each line is one whole sentence
            while (line != null) {
                ArrayList<String> curr = new ArrayList<String>();
                for (String w : line.split(" ")) {
                    String lower = w.toLowerCase();
                    curr.add(lower);
                    totalWords.add(lower);
                }
                sampleSentence.add(curr);
                line = read.readLine();
            }
        } catch (IOException e) {
            System.out.print("Can't open file");
        }
    }

    /**
     * Reads the training-tags file, filling {@code POS} and
     * {@code rightAnswers} with every tag in order.
     */
    public static void createPOS() {
        try (BufferedReader read = new BufferedReader(
                new FileReader("inputs/simple-train-tags.txt"))) {
            String line = read.readLine();
            while (line != null) {
                for (String w : line.split(" ")) {
                    POS.add(w);
                    rightAnswers.add(w);
                }
                line = read.readLine();
            }
        } catch (IOException e) {
            System.out.print("Can't open file");
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment