Skip to content

Instantly share code, notes, and snippets.

@arthurnn
Created December 3, 2011 01:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save arthurnn/1425667 to your computer and use it in GitHub Desktop.
Save arthurnn/1425667 to your computer and use it in GitHub Desktop.
This is the main file for Kik
package com.arthurnn;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
/**
 * Command-line entry point that ranks the most frequent words found in the
 * stories linked from an RSS feed.
 *
 * <p>The feed is parsed for {@code /rss/channel/item/link} values, one
 * {@link MapReduceThread} is started per link, and once all workers have
 * finished the accumulated word counts are sorted and the top entries are
 * printed to standard output.
 */
public class Kik {

    /** RSS feed whose linked stories are analysed. */
    private static final String FEED_URL = "http://rss.cbc.ca/lineup/topstories.xml";

    /** XPath selecting the text of every story link in the feed. */
    private static final String LINK_XPATH = "/rss/channel/item/link/text()";

    /** Maximum number of words printed in the final ranking. */
    private static final int TOP_WORDS = 50;

    /**
     * Downloads the feed, counts words of every linked story in parallel and
     * prints the {@value #TOP_WORDS} most frequent words as
     * {@code "<rank> <count> <word>"} lines.
     *
     * @param args ignored
     * @throws Exception if the feed cannot be fetched or parsed, the XPath
     *         expression cannot be evaluated, or the main thread is
     *         interrupted while waiting for the workers
     */
    public static void main(String[] args) throws Exception {
        // Shared accumulator; workers lock on this instance while merging.
        final HashMap<String, Integer> _words_count = new HashMap<String, Integer>();

        DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance();
        // The feed is remote, untrusted XML: turn on secure processing so the
        // parser limits entity expansion and (on the JDK's built-in parser)
        // refuses to fetch external DTDs/entities.
        domFactory.setFeature(javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING, true);
        DocumentBuilder builder = domFactory.newDocumentBuilder();
        Document doc = builder.parse(FEED_URL);

        XPath xpath = XPathFactory.newInstance().newXPath();
        XPathExpression expr = xpath.compile(LINK_XPATH);
        NodeList nodes = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);

        // Start one worker per story link, then wait for all of them so the
        // downloads overlap instead of running one after another.
        List<Thread> threads = new ArrayList<Thread>();
        for (int i = 0; i < nodes.getLength(); i++) {
            String link = nodes.item(i).getNodeValue();
            MapReduceThread thread = new MapReduceThread(link, _words_count);
            thread.start();
            threads.add(thread);
        }
        for (Thread thread : threads) {
            thread.join();
        }

        // Highest count first; equal counts are ordered alphabetically so the
        // output does not depend on HashMap iteration order.
        Comparator<Entry<String, Integer>> comparator = new Comparator<Entry<String, Integer>>() {
            @Override
            public int compare(Entry<String, Integer> arg0, Entry<String, Integer> arg1) {
                int byCount = arg1.getValue().compareTo(arg0.getValue());
                if (byCount != 0) {
                    return byCount;
                }
                return arg0.getKey().compareTo(arg1.getKey());
            }
        };

        List<Entry<String, Integer>> entries =
                new ArrayList<Entry<String, Integer>>(_words_count.entrySet());
        Collections.sort(entries, comparator);

        int rank = 0;
        for (Entry<String, Integer> e : entries) {
            if (rank >= TOP_WORDS) {
                break;
            }
            rank++;
            System.out.println(rank + " " + e.getValue() + " " + e.getKey());
        }
    }
}
package com.arthurnn;
import java.util.HashMap;
import java.util.Map.Entry;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
/**
 * Worker thread that downloads one story page, counts the words in its body
 * paragraphs and adds those counts to a map shared with other workers.
 *
 * <p>Counting happens in a thread-local map; the shared map is only touched
 * inside a {@code synchronized} block on the shared map instance, so several
 * workers may safely be given the same map.
 */
public class MapReduceThread extends Thread {

    /** URL of the story page to process. */
    private final String link;

    /** Accumulator shared between workers; guarded by its own monitor. */
    private final HashMap<String, Integer> _words_count;

    /**
     * @param link         URL of the story page to download
     * @param _words_count shared map receiving this worker's word counts
     */
    public MapReduceThread(String link, HashMap<String, Integer> _words_count) {
        this.link = link;
        this._words_count = _words_count;
    }

    /**
     * Fetches the page, counts the words of every text node matched by
     * {@code //div[@id='storybody']/p/text()} and merges the result into the
     * shared map.
     *
     * @throws RuntimeException wrapping any failure while downloading,
     *         parsing or counting; the message names the offending link
     */
    @Override
    public void run() {
        final HashMap<String, Integer> words_count = new HashMap<String, Integer>();
        try {
            CleanerProperties props = new CleanerProperties();
            final HtmlCleaner htmlCleaner = new HtmlCleaner(props);
            TagNode tagNode = htmlCleaner.clean(new java.net.URL(link));
            Object[] myNodes = tagNode
                    .evaluateXPath("//div[@id='storybody']/p/text()");
            for (Object node : myNodes) {
                countWords(String.valueOf(node), words_count);
            }
            mergeIntoShared(words_count);
        } catch (Exception e) {
            throw new RuntimeException("Failed to count words for " + link, e);
        }
    }

    /** Splits {@code text} on whitespace and tallies each non-empty word into {@code counts}. */
    private static void countWords(String text, HashMap<String, Integer> counts) {
        for (String token : text.split("\\s+")) {
            String key = token.replace("'", "").replace("\"", "");
            // split() yields a leading "" when the text starts with whitespace,
            // and a token made only of quotes becomes "" after stripping;
            // neither is a word, so do not count it.
            if (key.length() == 0) {
                continue;
            }
            Integer c = counts.get(key);
            counts.put(key, null == c ? 1 : c + 1);
        }
    }

    /** Adds every count in {@code local} to the shared map while holding the shared map's lock. */
    private void mergeIntoShared(HashMap<String, Integer> local) {
        synchronized (_words_count) {
            for (Entry<String, Integer> e : local.entrySet()) {
                String key = e.getKey();
                Integer n = _words_count.get(key);
                if (null == n) {
                    n = 0;
                }
                _words_count.put(key, n + e.getValue());
            }
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment