Created
November 9, 2014 00:40
-
-
Save keshavsaharia/2639b9d7ad0f1da8e524 to your computer and use it in GitHub Desktop.
Mark Twain wrote it
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
/**
 * Compares the word-frequency profile of a sample text with that of a reference
 * corpus and prints the average absolute frequency difference over the words
 * the two texts share.
 *
 * <p>The sample is read from the file {@code sample}; the reference corpus is the
 * concatenation of the files {@code huckfinn} and {@code tomsawyer}. All three
 * are resolved relative to the working directory.
 */
public class MarkTwainWroteIt {

    /**
     * Builds both frequency tables and prints either the average difference
     * (as a percentage) or a notice that the texts share no words.
     *
     * @param args ignored
     * @throws FileNotFoundException kept for interface compatibility; missing
     *     files are handled inside {@link #getFile(String)} and read as empty text
     */
    public static void main(String[] args) throws FileNotFoundException {
        // Sample: keep every word (threshold 0). Reference: drop words seen only once (threshold 1).
        Map<String, Double> sampleFrequency = getWordFrequency(getFile("sample"), 0);
        Map<String, Double> referenceFrequency =
                getWordFrequency(getFile("huckfinn") + getFile("tomsawyer"), 1);

        double totalDifference = 0;
        int sharedWords = 0;
        for (Map.Entry<String, Double> pair : sampleFrequency.entrySet()) {
            Double referenceValue = referenceFrequency.get(pair.getKey());
            if (referenceValue != null) {
                totalDifference += Math.abs(referenceValue - pair.getValue());
                sharedWords++;
            }
        }

        if (sharedWords > 0) {
            System.out.println("Average difference: " + (totalDifference * 100 / sharedWords) + "%");
        } else {
            System.out.println("No common difference.");
        }
    }

    /**
     * Computes the relative frequency of each word in {@code content}.
     *
     * <p>The text is normalised with {@link #removePunctuation(String)} and split
     * on single spaces. A word's frequency is its count divided by the total
     * number of words, including words that are later filtered out.
     *
     * @param content raw text to analyse
     * @param smoothing count threshold; only words occurring strictly more than
     *     this many times are included in the result
     * @return map from word to relative frequency; empty when the text contains
     *     no words
     */
    private static HashMap<String, Double> getWordFrequency(String content, int smoothing) {
        HashMap<String, Double> wordFrequency = new HashMap<>();

        String raw = removePunctuation(content);
        if (raw.isEmpty()) {
            // "".split(" ") yields [""], which would be counted as a word with frequency 1.0.
            return wordFrequency;
        }

        String[] words = raw.split(" ");
        Map<String, Integer> counts = new HashMap<>();
        for (String word : words) {
            counts.merge(word, 1, Integer::sum);
        }

        double wordCount = words.length;
        for (Map.Entry<String, Integer> pair : counts.entrySet()) {
            if (pair.getValue() > smoothing) {
                wordFrequency.put(pair.getKey(), pair.getValue() / wordCount);
            }
        }
        return wordFrequency;
    }

    /**
     * Normalises text for word counting: replaces each of the characters
     * {@code . ! , ? - ; : " _ * [ ]} with a space, collapses every run of
     * whitespace to a single space, lower-cases the result and trims it.
     *
     * @param content text to normalise
     * @return the normalised text; words are separated by exactly one space
     */
    private static String removePunctuation(String content) {
        // One character class replaces the per-character calls; the hyphen is last so it is literal.
        String spaced = content.replaceAll("[.!,?;:\"_*\\[\\]-]", " ");
        String collapsed = spaced.replaceAll("\\s+", " ");
        // Locale.ROOT keeps the word keys identical regardless of the machine's default locale.
        return collapsed.toLowerCase(Locale.ROOT).trim();
    }

    /**
     * Reads a whole file into one string, joining its lines with single spaces
     * (a trailing space follows the last line).
     *
     * @param name path of the file to read
     * @return the file's text, or the empty string if the file cannot be opened
     */
    private static String getFile(String name) {
        // try-with-resources closes the scanner (and the underlying file) on every path.
        try (Scanner input = new Scanner(new File(name))) {
            StringBuilder text = new StringBuilder();
            while (input.hasNextLine()) {
                text.append(input.nextLine()).append(' ');
            }
            return text.toString();
        } catch (FileNotFoundException e) {
            // Best effort: a missing file is treated as empty text, but say so.
            System.err.println("Warning: could not read file '" + name + "'; treating it as empty.");
            return "";
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment