Skip to content

Instantly share code, notes, and snippets.

@4e6
Created November 26, 2011 14:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save 4e6/1395803 to your computer and use it in GitHub Desktop.
Test task from HireRight
package hr.the4e6;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class Scraper {

    /** Block-level tags whose enclosed text is extracted as one text block. */
    private static final String[] blockTags = { "html", "head", "body",
            "frameset", "script", "noscript", "style", "meta", "link", "title",
            "frame", "noframes", "section", "nav", "aside", "hgroup", "header",
            "footer", "p", "h1", "h2", "h3", "h4", "h5", "h6", "ul", "ol",
            "pre", "div", "blockquote", "hr", "address", "figure",
            "figcaption", "form", "fieldset", "ins", "del", "dl", "dt", "dd",
            "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup",
            "col", "tr", "th", "td", "video", "audio", "canvas", "details",
            "menu", "plaintext" };

    /** Tags whose entire content is discarded when parsing. */
    private static final String[] ignoreTags = { "html", "head", "title",
            "style", "script" };

    /** Set views of the tag arrays for O(1) membership tests. */
    private static final HashSet<String> blockTagsSet = new HashSet<String>(
            Arrays.asList(blockTags));
    private static final HashSet<String> ignoreTagsSet = new HashSet<String>(
            Arrays.asList(ignoreTags));

    // List of URLs to scrape (from command line arguments).
    private static LinkedList<URL> urls = new LinkedList<URL>();
    // List of words to search for (from command line arguments).
    private static LinkedList<String> words = new LinkedList<String>();
    // -w: count occurrences of the given word(s) on the web page(s).
    private static boolean wFlag = false;
    // -c: count the number of characters of each web page.
    private static boolean cFlag = false;
    // -e: extract sentences which contain the given words.
    private static boolean eFlag = false;
    // -v: verbose output (print timing information).
    private static boolean vFlag = false;

    // Pattern to extract an opening tag; group(1) captures the tag name.
    private static final Pattern RX_TAG = Pattern.compile("<(\\w+).*?>");
    // Pattern to extract a sentence from a block of text: starts with a
    // capital letter, ends at ". " or at the end of the block.
    private static final Pattern RX_SENTENCE = Pattern
            .compile("[A-Z].+?(\\.\\s|\\Z)");

    /**
     * Entry point: parses command line arguments, then scrapes and processes
     * each requested URL in turn, printing results to standard output.
     *
     * @param args command line arguments, see {@link #setParameters(String[])}
     * @throws IOException if closing an input stream fails
     */
    public static void main(String[] args) throws IOException {
        // read command line arguments
        setParameters(args);
        for (URL url : urls) {
            LinkedList<String> out = new LinkedList<String>();
            out.add("");
            out.add("---------------------");
            out.add(url.toString());
            out.add("---------------------");
            // Connecting
            StringBuilder html = readUrl(url);
            // Scraping
            long startScraping = System.nanoTime();
            LinkedList<String> blocks = scrape(html);
            long stopScraping = System.nanoTime();
            // Processing
            long startProcessing = System.nanoTime();
            if (wFlag || eFlag) {
                for (String word : words) {
                    out.addAll(countWord(word, blocks));
                }
            }
            if (cFlag) {
                int counter = 0;
                for (String block : blocks) {
                    counter += block.length();
                }
                out.add("Characters: " + counter);
            }
            long stopProcessing = System.nanoTime();
            if (vFlag) {
                long scrapingTime = stopScraping - startScraping;
                long processingTime = stopProcessing - startProcessing;
                out.add("Scraping: " + formatTime(scrapingTime, 0));
                out.add("Processing: " + formatTime(processingTime, 0));
            }
            // printing results
            for (String l : out) {
                System.out.println(l);
            }
        }
    }

    /**
     * Format a duration by repeatedly dividing by 1000 and attaching the
     * matching unit suffix.
     *
     * @param t time value, expressed in the unit denoted by {@code prefix}
     * @param prefix index into the unit table to start from (0 = nanoseconds)
     * @return formatted time with unit suffix, e.g. "12ms"
     */
    private static String formatTime(long t, int prefix) {
        String[] px = { "ns", "us", "ms", "s" };
        if (prefix == px.length - 1 || t / 1000 == 0)
            return t + px[prefix];
        else
            return formatTime(t / 1000, ++prefix);
    }

    /**
     * Count word occurrence in a collection (case sensitive) and prepare
     * statements for printing. Substring matches count too: for example,
     * 'one' is counted inside 'component'.
     *
     * @param word the word to count
     * @param blocks text blocks extracted from a page
     * @return lines ready for printing: a header, plus (with -e) sentences
     */
    private static LinkedList<String> countWord(String word, List<String> blocks) {
        String warningMsg = "[!] Cant extract exact sentences from blocks."
                + " Poor text formatting.";
        LinkedList<String> wordBlocks = filter(word, blocks);
        LinkedList<String> wordSentences = new LinkedList<String>();
        int counter = 0;
        for (String block : wordBlocks) {
            counter += strCount(block, word, 0);
            LinkedList<String> sentences = split(block, RX_SENTENCE);
            wordSentences.addAll(filter(word, sentences));
        }
        // With -w the header carries the total count; otherwise just the word.
        String head = word + ": " + (wFlag ? counter : "");
        wordBlocks.add(warningMsg);
        LinkedList<String> out = new LinkedList<String>();
        out.addFirst(head);
        if (eFlag) {
            // If per-sentence extraction accounted for every occurrence,
            // print the sentences; otherwise fall back to whole blocks plus
            // the formatting warning appended above.
            out.addAll(counter == wordSentences.size() ? wordSentences
                    : wordBlocks);
            out.add("");
        }
        return out;
    }

    /**
     * Count (possibly overlapping) occurrences of {@code subs} in {@code s},
     * added to the running total {@code c}. The search resumes one character
     * past each match start, so e.g. "aa" occurs twice in "aaa".
     *
     * @param s string to search in
     * @param subs substring to search for
     * @param c running total to add to
     * @return {@code c} plus the number of occurrences found
     */
    private static int strCount(String s, String subs, int c) {
        // Iterative form of the former per-occurrence recursion: avoids a
        // potential StackOverflowError on blocks with very many matches.
        int i = s.indexOf(subs);
        while (i != -1) {
            c++;
            i = s.indexOf(subs, i + 1);
        }
        return c;
    }

    /**
     * Get the contents of the given URL, decoded as UTF-8.
     *
     * @param url the page to download
     * @return page markup; empty if the connection failed (a message is
     *         printed to stderr in that case)
     * @throws IOException if closing the stream fails
     */
    private static StringBuilder readUrl(URL url) throws IOException {
        StringBuilder html = new StringBuilder();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(url.openStream(),
                    "UTF-8"));
            String line;
            while ((line = br.readLine()) != null) {
                // Re-insert the line break: readLine() strips it, and gluing
                // lines together would merge words (and break the ". "
                // sentence delimiter) across line boundaries.
                html.append(line).append('\n');
            }
        } catch (IOException e) {
            System.err.println("[!] Can't connect to " + url);
        } finally {
            if (br != null)
                br.close();
        }
        return html;
    }

    /**
     * Extract text blocks from HTML markup. The input buffer is consumed in
     * place: ignored content is deleted, block-level content is collected,
     * and inline tags are stripped.
     *
     * @param in page markup; modified destructively
     * @return list of trimmed, non-empty text blocks
     * @throws IOException declared for API compatibility
     */
    private static LinkedList<String> scrape(StringBuilder in)
            throws IOException {
        LinkedList<Integer> index = new LinkedList<Integer>();
        LinkedList<String> blocks = new LinkedList<String>();
        // Record the start offset of every opening tag. Processing the stack
        // last-to-first means deletions happen at or after the popped offset,
        // keeping the earlier offsets still on the stack valid.
        Matcher tag = RX_TAG.matcher(in);
        while (tag.find()) {
            index.push(tag.start());
        }
        while (!index.isEmpty()) {
            Matcher startTags = RX_TAG.matcher(in);
            String startTag = null;
            int startTag_s = 0, startTag_e = 0, startTag_l = 0;
            if (startTags.find(index.pop())) {
                startTag = startTags.group(1);
                startTag_s = startTags.start();
                startTag_e = startTags.end();
                startTag_l = startTag_e - startTag_s;
            }
            if (startTag == null) {
                // Stale offset: the tag was inside content deleted by an
                // earlier iteration. Nothing left to do for it.
                continue;
            }
            String endTag = "</" + startTag + ">";
            // Case-insensitive so </DIV> closes <div> as well; the tag name
            // is \w+ so it contains no regex metacharacters.
            Matcher endTags = Pattern.compile(endTag, Pattern.CASE_INSENSITIVE)
                    .matcher(in);
            int endTag_s = 0, endTag_e = 0;
            if (endTags.find(startTag_e)) {
                endTag_s = endTags.start();
                endTag_e = endTags.end();
                String sentence = in.substring(startTag_e, endTag_s).trim();
                if (ignoreTagsSet.contains(startTag.toLowerCase())) {
                    // Drop the element and everything inside it.
                    in.delete(startTag_s, endTag_e);
                } else if (blockTagsSet.contains(startTag.toLowerCase())) {
                    // Keep the element's text as one block, then drop it.
                    if (!sentence.isEmpty()) {
                        blocks.add(sentence);
                    }
                    in.delete(startTag_s, endTag_e);
                } else {
                    // Inline tag: strip the markup but keep the content.
                    // After the first delete, the end tag shifted left by the
                    // length of the removed opening tag.
                    in.delete(startTag_s, startTag_e);
                    in.delete(endTag_s - startTag_l, endTag_e - startTag_l);
                }
            } else {
                // Unclosed tag: remove just the opening tag.
                in.delete(startTag_s, startTag_e);
            }
        }
        return blocks;
    }

    /**
     * Filter a collection, retaining elements which contain the given word.
     *
     * @param word substring to look for (case sensitive)
     * @param collection strings to filter
     * @return new list with only the matching elements
     */
    private static LinkedList<String> filter(String word,
            List<String> collection) {
        LinkedList<String> filteredCollection = new LinkedList<String>();
        for (String string : collection) {
            if (string.contains(word)) {
                filteredCollection.add(string);
            }
        }
        return filteredCollection;
    }

    /**
     * Split a string into the substrings matched by the given pattern.
     *
     * @param block string to split
     * @param pattern pattern whose matches become the result elements
     * @return list of matched substrings, in order
     */
    private static LinkedList<String> split(String block, Pattern pattern) {
        LinkedList<String> wordSentences = new LinkedList<String>();
        Matcher m = pattern.matcher(block);
        while (m.find()) {
            wordSentences.add(m.group());
        }
        return wordSentences;
    }

    /**
     * Parse and set up command line arguments. Accepted forms:
     * flags (-w, -c, -e, -v), then either a single http(s) URL or the name of
     * a file containing one URL per line, then an optional comma-separated
     * word list. Exits with status 1 on fatal argument errors.
     *
     * @param args command line arguments
     * @throws IOException if closing the URL file fails
     */
    private static void setParameters(String[] args) throws IOException {
        for (String arg : args) {
            if ("-w".equals(arg)) {
                wFlag = true;
            } else if ("-c".equals(arg)) {
                cFlag = true;
            } else if ("-e".equals(arg)) {
                eFlag = true;
            } else if ("-v".equals(arg)) {
                vFlag = true;
            } else if (urls.isEmpty()
                    && (arg.startsWith("http://") || arg.startsWith("https://"))) {
                // First non-flag argument may be a single URL...
                addUrl(arg);
            } else if (urls.isEmpty()) {
                // ...or the name of a file listing one URL per line.
                BufferedReader br = null;
                try {
                    br = new BufferedReader(new FileReader(arg));
                    String url;
                    while ((url = br.readLine()) != null) {
                        addUrl(url);
                    }
                } catch (FileNotFoundException e) {
                    System.err.println("[!] File not found: " + arg);
                    System.exit(1);
                } catch (IOException e) {
                    System.err.println("[!] IO error while reading file: " + arg);
                } finally {
                    if (br != null)
                        br.close();
                }
            } else if (words.isEmpty()) {
                // Comma-separated list of words to search for.
                words.addAll(Arrays.asList(arg.split(",")));
            } else {
                System.err.println("[!] Malformed parameter: " + arg);
                System.exit(1);
            }
        }
        if (urls.isEmpty()) {
            System.err.println("[!] You must provide at least one url to scrape!");
            System.exit(1);
        }
        if ((wFlag || eFlag) && words.isEmpty()) {
            System.err.println("[!] You must provide words to count or remove '"
                    + (wFlag ? "-w" : "-e") + "' parameter!");
            System.exit(1);
        }
    }

    /**
     * Parse a URL string and add it to the scrape list; malformed URLs are
     * reported to stderr and skipped.
     *
     * @param url URL string to add
     */
    private static void addUrl(String url) {
        try {
            urls.add(new URL(url));
        } catch (MalformedURLException e) {
            System.err.println("[!] Malformed URL: " + url);
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment