Skip to content

Instantly share code, notes, and snippets.

@saptak
Created September 1, 2015 14:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save saptak/b89d5d28702f3cfcc833 to your computer and use it in GitHub Desktop.
Save saptak/b89d5d28702f3cfcc833 to your computer and use it in GitHub Desktop.
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.lang.reflect.Array;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.*;
/**
 * Finds the 20 most frequent words in a pseudo-random sample of lines taken from an input file.
 *
 * <p>The sample is driven by a {@link Random} seeded from the user name, so the same user name
 * always selects the same line indexes. Words are split on {@link #delimiters}, trimmed,
 * lower-cased, and dropped if they appear in {@link #stopWordsArray}.
 */
public class MP1 {
    /** Number of entries returned by {@link #process()}. */
    private static final int TOP_WORD_COUNT = 20;

    /** Number of line indexes drawn by {@link #getIndexes()}. */
    private static final int SAMPLE_SIZE = 10000;

    /** Exclusive upper bound of the sampled line indexes; the input needs at least this many lines. */
    private static final int NUMBER_OF_LINES = 50000;

    Random generator;
    String userName;
    String inputFileName;
    String delimiters = " \t,;.?!-:@[](){}_*/";
    String[] stopWordsArray = {"i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
            "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its",
            "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that",
            "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having",
            "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
            "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before",
            "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again",
            "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each",
            "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than",
            "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"};

    /**
     * Creates a processor for the given user and input file.
     *
     * @param userName      user id used to seed the line sampling
     * @param inputFileName path of the text file to analyse
     */
    public MP1(String userName, String inputFileName) {
        this.userName = userName;
        this.inputFileName = inputFileName;
    }

    /**
     * Seeds {@link #generator} from the "SHA" digest of the lower-cased, trimmed seed string.
     *
     * @param seed text the random seed is derived from
     * @throws NoSuchAlgorithmException if the "SHA" digest is unavailable
     */
    void initialRandomGenerator(String seed) throws NoSuchAlgorithmException {
        MessageDigest messageDigest = MessageDigest.getInstance("SHA");
        // Locale.ROOT keeps the seed independent of the JVM's default locale.
        // NOTE(review): getBytes() still uses the platform charset; harmless for ASCII ids.
        messageDigest.update(seed.toLowerCase(Locale.ROOT).trim().getBytes());
        byte[] seedDigest = messageDigest.digest();
        long longSeed = 0;
        // Java masks a long shift distance to its low 6 bits, so bytes at index 8 and above
        // are added back onto the low positions. Kept exactly as-is: changing the folding
        // would change every sampled index.
        for (int i = 0; i < seedDigest.length; i++) {
            longSeed += ((long) seedDigest[i] & 0xffL) << (8 * i);
        }
        this.generator = new Random(longSeed);
    }

    /**
     * Draws the line indexes to sample, re-seeding the generator from {@link #userName} first.
     *
     * @return 10000 indexes, each in the range [0, 50000)
     * @throws NoSuchAlgorithmException if the seeding digest is unavailable
     */
    Integer[] getIndexes() throws NoSuchAlgorithmException {
        Integer[] ret = new Integer[SAMPLE_SIZE];
        this.initialRandomGenerator(this.userName);
        for (int i = 0; i < SAMPLE_SIZE; i++) {
            ret[i] = generator.nextInt(NUMBER_OF_LINES);
        }
        return ret;
    }

    /**
     * Reads the whole file and returns the lines selected by {@link #getIndexes()}, in draw order
     * (an index drawn twice contributes its line twice).
     *
     * @param filename path of the file to read
     * @return the sampled lines
     * @throws IOException              if the file cannot be read or is shorter than a sampled index
     * @throws NoSuchAlgorithmException if the seeding digest is unavailable
     */
    private List<String> readFile(String filename) throws IOException, NoSuchAlgorithmException {
        List<String> records = new ArrayList<String>();
        // try-with-resources closes the reader even when readLine() fails.
        try (BufferedReader reader = new BufferedReader(new FileReader(filename))) {
            String line;
            while ((line = reader.readLine()) != null) {
                records.add(line);
            }
        }

        Integer[] sampledIndexes = this.getIndexes();
        List<String> sampledLines = new ArrayList<String>(sampledIndexes.length);
        for (int index : sampledIndexes) {
            if (index >= records.size()) {
                // Fail with a clear message instead of returning null and crashing later.
                throw new IOException("Input file '" + filename + "' has " + records.size()
                        + " lines, but line index " + index + " was sampled");
            }
            sampledLines.add(records.get(index));
        }
        return sampledLines;
    }

    /**
     * Returns a copy of the map whose iteration order is descending by value, with equal values
     * ordered ascending by key.
     *
     * @param unsortMap map whose keys and values are mutually {@link Comparable}
     * @return a new {@link LinkedHashMap} holding the same entries in sorted order
     */
    @SuppressWarnings({ "unchecked", "rawtypes" })
    public static Map sortByDescValueAscKey(Map unsortMap) {
        List<Map.Entry> entries = new ArrayList<Map.Entry>(unsortMap.entrySet());
        Collections.sort(entries, new Comparator<Map.Entry>() {
            @Override
            public int compare(Map.Entry left, Map.Entry right) {
                Object leftValue = left.getValue();
                Object rightValue = right.getValue();
                // Equality must come from compareTo, not ==: boxed numbers outside the small
                // Integer cache are distinct objects even when numerically equal.
                int byValueDesc = (leftValue == rightValue)
                        ? 0
                        : ((Comparable) rightValue).compareTo(leftValue);
                if (byValueDesc != 0) {
                    return byValueDesc;
                }
                return ((Comparable) left.getKey()).compareTo(right.getKey());
            }
        });
        Map sortedMap = new LinkedHashMap();
        for (Map.Entry entry : entries) {
            sortedMap.put(entry.getKey(), entry.getValue());
        }
        return sortedMap;
    }

    /**
     * Counts the non-stop-words in the sampled lines and returns the most frequent ones.
     *
     * @return an array of length 20 holding the words in descending frequency (ties broken
     *         alphabetically); trailing slots are {@code null} when fewer than 20 distinct
     *         words exist
     * @throws Exception if the input file cannot be read or sampled
     */
    public String[] process() throws Exception {
        String[] ret = new String[TOP_WORD_COUNT];
        List<String> lines = this.readFile(this.inputFileName);

        // Built once per call so each token costs one hash lookup rather than a linear scan.
        Set<String> stopWords = new HashSet<String>(Arrays.asList(stopWordsArray));

        // Counting straight into the map avoids the sort-and-group pass, which also used to
        // insert a phantom "" -> 0 entry.
        Map<String, Integer> wordCounts = new HashMap<String, Integer>();
        for (String line : lines) {
            StringTokenizer tokenizer = new StringTokenizer(line, delimiters);
            while (tokenizer.hasMoreTokens()) {
                String word = tokenizer.nextToken().trim().toLowerCase(Locale.ROOT);
                if (stopWords.contains(word)) {
                    continue;
                }
                Integer seen = wordCounts.get(word);
                wordCounts.put(word, seen == null ? 1 : seen + 1);
            }
        }

        @SuppressWarnings("unchecked")
        Map<String, Integer> rankedWords = sortByDescValueAscKey(wordCounts);
        int next = 0;
        for (String word : rankedWords.keySet()) {
            if (next == ret.length) {
                break;
            }
            ret[next++] = word;
        }
        return ret;
    }

    /**
     * Command-line entry point: {@code MP1 <User ID>}. Reads {@code ./input.txt} and prints the
     * top words, one per line.
     *
     * @param args command-line arguments; the first is the user id
     * @throws Exception if processing the input file fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.out.println("MP1 <User ID>");
        } else {
            String userName = args[0];
            String inputFileName = "./input.txt";
            MP1 mp = new MP1(userName, inputFileName);
            String[] topItems = mp.process();
            for (String item : topItems) {
                System.out.println(item);
            }
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment