Skip to content

Instantly share code, notes, and snippets.

@keshavsaharia
Created November 9, 2014 00:40
Show Gist options
  • Save keshavsaharia/2639b9d7ad0f1da8e524 to your computer and use it in GitHub Desktop.
Save keshavsaharia/2639b9d7ad0f1da8e524 to your computer and use it in GitHub Desktop.
Mark Twain wrote it
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
/**
 * Compares the word-frequency profile of a sample text against a reference
 * corpus and prints the average absolute difference in relative frequency
 * over the words the two texts have in common.
 *
 * <p>The sample is read from the file {@code sample}; the reference corpus is
 * the concatenation of the files {@code huckfinn} and {@code tomsawyer}. All
 * three are resolved relative to the current working directory.
 */
public class MarkTwainWroteIt {

    /** Regex matching every single character that is treated as a word separator. */
    private static final String PUNCTUATION_REGEX = "[.!,?\\-;:\"_*\\[\\]]";

    /**
     * Program entry point: builds both frequency tables and prints the result.
     *
     * @param args ignored
     * @throws FileNotFoundException retained in the signature for compatibility;
     *         {@link #getFile(String)} handles missing files itself
     */
    public static void main(String[] args) throws FileNotFoundException {
        // The sample keeps every word (smoothing 0); the reference corpus drops
        // words that occur only once (smoothing 1).
        HashMap<String, Double> sampleFrequency = getWordFrequency(getFile("sample"), 0);
        HashMap<String, Double> corpusFrequency =
                getWordFrequency(getFile("huckfinn") + getFile("tomsawyer"), 1);

        double totalDifference = 0;
        int sharedWords = 0;
        for (Map.Entry<String, Double> pair : sampleFrequency.entrySet()) {
            // Only words present in both tables contribute to the average.
            Double corpusValue = corpusFrequency.get(pair.getKey());
            if (corpusValue != null) {
                totalDifference += Math.abs(corpusValue - pair.getValue());
                sharedWords++;
            }
        }

        if (sharedWords > 0) {
            System.out.println("Average difference: " + (totalDifference * 100 / sharedWords) + "%");
        }
        else {
            System.out.println("No common difference.");
        }
    }

    /**
     * Computes the relative frequency of each word in {@code content}.
     *
     * <p>The text is normalized with {@link #removePunctuation(String)} and split
     * on single spaces. A word's frequency is its occurrence count divided by the
     * total number of words, including words that are later filtered out.
     *
     * @param content   raw text to analyze
     * @param smoothing minimum-count threshold; only words occurring strictly
     *                  more than this many times are included in the result
     * @return map from word to relative frequency; empty when the text contains
     *         no words
     */
    private static HashMap<String, Double> getWordFrequency(String content, int smoothing) {
        HashMap<String, Double> wordFrequency = new HashMap<String, Double>();
        String raw = removePunctuation(content);
        // "".split(" ") yields a single empty token, which would otherwise be
        // counted as a word with frequency 1.0.
        if (raw.isEmpty()) {
            return wordFrequency;
        }

        String[] words = raw.split(" ");
        HashMap<String, Integer> counts = new HashMap<String, Integer>();
        for (String token : words) {
            Integer seen = counts.get(token);
            counts.put(token, seen == null ? 1 : seen + 1);
        }

        double wordCount = words.length;
        for (Map.Entry<String, Integer> pair : counts.entrySet()) {
            if (pair.getValue() > smoothing) {
                wordFrequency.put(pair.getKey(), pair.getValue() / wordCount);
            }
        }
        return wordFrequency;
    }

    /**
     * Normalizes text for word counting: replaces each punctuation character in
     * {@link #PUNCTUATION_REGEX} with a space, collapses runs of whitespace into
     * one space, lower-cases the result and trims both ends.
     *
     * @param content raw text
     * @return normalized text with words separated by single spaces
     */
    private static String removePunctuation(String content) {
        String cleaned = content.replaceAll(PUNCTUATION_REGEX, " ");
        cleaned = cleaned.replaceAll("\\s+", " ");
        // Locale.ROOT keeps the casing independent of the machine's default
        // locale (e.g. Turkish maps 'I' to a dotless 'i').
        cleaned = cleaned.toLowerCase(Locale.ROOT);
        return cleaned.trim();
    }

    /**
     * Reads a whole file into one string, joining its lines with single spaces
     * (each line, including the last, is followed by one space).
     *
     * @param name path of the file to read
     * @return the file's contents, or the empty string when the file cannot be
     *         opened (a warning is written to standard error in that case)
     */
    private static String getFile(String name) {
        Scanner input;
        try {
            input = new Scanner(new File(name));
        }
        // Best effort: a missing file is treated as empty text, but say so.
        catch (FileNotFoundException e) {
            System.err.println("Warning: could not open file '" + name + "'; treating it as empty.");
            return "";
        }

        // Always release the file handle, even if reading fails part-way.
        try {
            StringBuilder sb = new StringBuilder();
            while (input.hasNextLine()) {
                sb.append(input.nextLine());
                sb.append(" ");
            }
            return sb.toString();
        }
        finally {
            input.close();
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment