Created
November 26, 2011 14:53
-
-
Save 4e6/1395803 to your computer and use it in GitHub Desktop.
Test task from HireRight
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package hr.the4e6; | |
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Test-task web scraper.
 *
 * <p>Fetches one or more URLs, strips HTML markup into text blocks and,
 * depending on the command line flags, counts word occurrences (-w),
 * extracts sentences containing given words (-e), counts page characters
 * (-c) and prints timing information (-v).
 */
public class Scraper {
    /**
     * Block-level tags: the text between such a tag and its closing tag is
     * collected as one text block, then the whole element is removed.
     */
    private static final String[] blockTags = { "html", "head", "body",
            "frameset", "script", "noscript", "style", "meta", "link", "title",
            "frame", "noframes", "section", "nav", "aside", "hgroup", "header",
            "footer", "p", "h1", "h2", "h3", "h4", "h5", "h6", "ul", "ol",
            "pre", "div", "blockquote", "hr", "address", "figure",
            "figcaption", "form", "fieldset", "ins", "del", "dl", "dt", "dd",
            "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup",
            "col", "tr", "th", "td", "video", "audio", "canvas", "details",
            "menu", "plaintext" };
    /**
     * Tags whose content is discarded entirely when parsing.
     */
    private static final String[] ignoreTags = { "html", "head", "title",
            "style", "script" };
    private static final HashSet<String> blockTagsSet = new HashSet<String>(
            Arrays.asList(blockTags));
    private static final HashSet<String> ignoreTagsSet = new HashSet<String>(
            Arrays.asList(ignoreTags));
    // List of URLs to scrape (from command line arguments)
    private static final LinkedList<URL> urls = new LinkedList<URL>();
    // List of words to search for (from command line arguments)
    private static final LinkedList<String> words = new LinkedList<String>();
    // -w: count occurrences of the provided word(s) on the web page(s)
    private static boolean wFlag = false;
    // -c: count the number of characters of each web page
    private static boolean cFlag = false;
    // -e: extract sentences which contain the given words
    private static boolean eFlag = false;
    // -v: verbose, print scraping/processing timings
    private static boolean vFlag = false;
    // Matches an opening HTML tag; group(1) is the tag name.
    private static final Pattern RX_TAG = Pattern.compile("<(\\w+).*?>");
    // Extracts one sentence: capital letter up to ". " or end of input.
    private static final Pattern RX_SENTENCE = Pattern
            .compile("[A-Z].+?(\\.\\s|\\Z)");

    /**
     * Entry point. Usage:
     * {@code Scraper [-w] [-c] [-e] [-v] <url|url-file> [word,word,...]}
     *
     * @throws IOException if closing an input stream fails
     */
    public static void main(String[] args) throws IOException {
        // read command line arguments (exits on unusable input)
        setParameters(args);
        for (URL url : urls) {
            LinkedList<String> out = new LinkedList<String>();
            out.add("");
            out.add("---------------------");
            out.add(url.toString());
            out.add("---------------------");
            // Connecting
            StringBuilder html = readUrl(url);
            // Scraping
            long startScraping = System.nanoTime();
            LinkedList<String> blocks = scrape(html);
            long stopScraping = System.nanoTime();
            // Processing
            long startProcessing = System.nanoTime();
            if (wFlag || eFlag) {
                for (String word : words) {
                    out.addAll(countWord(word, blocks));
                }
            }
            if (cFlag) {
                int counter = 0;
                for (String block : blocks) {
                    counter += block.length();
                }
                out.add("Characters: " + counter);
            }
            long stopProcessing = System.nanoTime();
            if (vFlag) {
                long scrapingTime = stopScraping - startScraping;
                long processingTime = stopProcessing - startProcessing;
                out.add("Scraping: " + formatTime(scrapingTime, 0));
                out.add("Processing: " + formatTime(processingTime, 0));
            }
            // printing results
            for (String l : out) {
                System.out.println(l);
            }
        }
    }

    /**
     * Format a duration with the largest unit that keeps it non-zero.
     *
     * @param t time value, in the unit selected by {@code prefix}
     * @param prefix index into the unit table; callers pass 0 (nanoseconds)
     * @return formatted time with unit suffix (ns, us, ms or s)
     */
    private static String formatTime(long t, int prefix) {
        String[] px = { "ns", "us", "ms", "s" };
        // Recursion depth is bounded by px.length, so it is safe.
        if (prefix == px.length - 1 || t / 1000 == 0)
            return t + px[prefix];
        else
            return formatTime(t / 1000, ++prefix);
    }

    /**
     * Count occurrences of {@code word} in the collection (case sensitive,
     * plain substring match — e.g. it counts 'one' inside 'component') and
     * prepare the statement lines for printing.
     *
     * <p>With -e the matching sentences are listed; if sentence splitting
     * does not account for every counted occurrence, the raw blocks are
     * printed with a warning instead.
     */
    private static LinkedList<String> countWord(String word, List<String> blocks) {
        String warningMsg = "[!] Cant extract exact sentences from blocks."
                + " Poor text formatting.";
        LinkedList<String> wordBlocks = filter(word, blocks);
        LinkedList<String> wordSentences = new LinkedList<String>();
        int counter = 0;
        for (String block : wordBlocks) {
            counter += strCount(block, word, 0);
            LinkedList<String> sentences = split(block, RX_SENTENCE);
            wordSentences.addAll(filter(word, sentences));
        }
        String head = word + ": " + (wFlag ? counter : "");
        // The warning becomes the last line of the raw-block fallback output.
        wordBlocks.add(warningMsg);
        LinkedList<String> out = new LinkedList<String>();
        out.addFirst(head);
        if (eFlag) {
            // Trust the sentence split only if it found every occurrence.
            out.addAll(counter == wordSentences.size() ? wordSentences
                    : wordBlocks);
            out.add("");
        }
        return out;
    }

    /**
     * Count occurrences of substring {@code subs} in string {@code s}.
     * Overlapping matches count (e.g. "aa" occurs twice in "aaa"), matching
     * the original recursive version which advanced by one character.
     * Rewritten iteratively: the recursion was one frame per occurrence and
     * could overflow the stack on large pages; it also threw
     * StringIndexOutOfBoundsException for an empty {@code subs}.
     *
     * @param c initial counter value, added to the result
     */
    private static int strCount(String s, String subs, int c) {
        if (subs.isEmpty()) {
            // An empty word matches nothing meaningful; avoid looping forever.
            return c;
        }
        int i = s.indexOf(subs);
        while (i != -1) {
            c++;
            // Advance by one character only so overlapping matches count.
            i = s.indexOf(subs, i + 1);
        }
        return c;
    }

    /**
     * Get the contents of the given URL (best effort: connection failures
     * are reported and yield an empty result).
     *
     * <p>Reads with the platform default charset — NOTE(review): consider
     * honoring the page's declared encoding.
     *
     * @return page markup with lines separated by '\n'
     * @throws IOException if closing the stream fails
     */
    private static StringBuilder readUrl(URL url) throws IOException {
        StringBuilder html = new StringBuilder();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(url.openStream()));
            String line;
            while ((line = br.readLine()) != null) {
                // BUGFIX: keep a separator between lines; without it the last
                // word of one line merged with the first word of the next,
                // corrupting word counts and sentence extraction.
                html.append(line).append('\n');
            }
        } catch (IOException e) {
            System.err.println("[!] Can't connect to " + url);
        } finally {
            if (br != null)
                br.close();
        }
        return html;
    }

    /**
     * Extract text blocks from HTML markup. Destructive: consumes
     * {@code in} while parsing.
     *
     * <p>Opening-tag positions are collected first, then processed from the
     * rightmost tag to the leftmost (push/pop on the index list is LIFO),
     * so deletions — which always happen at or after the current position —
     * never invalidate the indices still pending to the left.
     *
     * @return list of non-empty text blocks found inside block-level tags
     */
    private static LinkedList<String> scrape(StringBuilder in)
            throws IOException {
        LinkedList<Integer> index = new LinkedList<Integer>();
        LinkedList<String> blocks = new LinkedList<String>();
        Matcher tag = RX_TAG.matcher(in);
        while (tag.find()) {
            // push() == addFirst(): positions end up in last-to-first order.
            index.push(tag.start());
        }
        while (!index.isEmpty()) {
            Matcher startTags = RX_TAG.matcher(in);
            String startTag = null;
            int startTag_s = 0, startTag_e = 0, startTag_l = 0;
            if (startTags.find(index.pop())) {
                startTag = startTags.group(1);
                startTag_s = startTags.start();
                startTag_e = startTags.end();
                startTag_l = startTag_e - startTag_s;
            }
            if (startTag == null) {
                // Defensive: no tag found at this position — previously this
                // built the bogus pattern "</null>"; nothing to do here.
                continue;
            }
            // Tag names match \w+ only, so they are safe inside a regex.
            String endTag = "</" + startTag + ">";
            Matcher endTags = Pattern.compile(endTag).matcher(in);
            // Locale.ROOT: tag matching must not depend on the default
            // locale (e.g. Turkish dotless-i breaks "TITLE" -> "title").
            String tagName = startTag.toLowerCase(Locale.ROOT);
            int endTag_s = 0, endTag_e = 0;
            if (endTags.find(startTag_e)) {
                endTag_s = endTags.start();
                endTag_e = endTags.end();
                String sentence = in.substring(startTag_e, endTag_s).trim();
                if (ignoreTagsSet.contains(tagName)) {
                    // Drop the element together with its content.
                    in.delete(startTag_s, endTag_e);
                } else if (blockTagsSet.contains(tagName)) {
                    if (!sentence.isEmpty()) {
                        blocks.add(sentence);
                    }
                    in.delete(startTag_s, endTag_e);
                } else {
                    // Inline tag: unwrap it, keep its content.
                    in.delete(startTag_s, startTag_e);
                    // End-tag offsets shifted left by the removed start tag.
                    in.delete(endTag_s - startTag_l, endTag_e - startTag_l);
                }
            } else {
                // Unmatched opening tag — just remove it.
                in.delete(startTag_s, startTag_e);
            }
        }
        return blocks;
    }

    /**
     * Filter the collection, retaining only elements which contain the
     * given word.
     *
     * @return filtered collection
     */
    private static LinkedList<String> filter(String word,
            List<String> collection) {
        LinkedList<String> filteredCollection = new LinkedList<String>();
        for (String string : collection) {
            if (string.contains(word)) {
                filteredCollection.add(string);
            }
        }
        return filteredCollection;
    }

    /**
     * Split a string into the substrings matched by the given pattern.
     */
    private static LinkedList<String> split(String block, Pattern pattern) {
        LinkedList<String> wordSentences = new LinkedList<String>();
        Matcher m = pattern.matcher(block);
        while (m.find()) {
            wordSentences.add(m.group());
        }
        return wordSentences;
    }

    /**
     * Parse command line arguments into the static flags, {@link #urls}
     * and {@link #words}. Exits the process on unusable input.
     *
     * <p>The first non-flag argument is either a URL (http:// or https://)
     * or the name of a file with one URL per line; the next one is a
     * comma-separated word list.
     *
     * @param args command line arguments
     * @throws IOException if closing the URL file fails
     */
    private static void setParameters(String[] args) throws IOException {
        for (String arg : args) {
            if ("-w".equals(arg)) {
                wFlag = true;
            } else if ("-c".equals(arg)) {
                cFlag = true;
            } else if ("-e".equals(arg)) {
                eFlag = true;
            } else if ("-v".equals(arg)) {
                vFlag = true;
            } else if (urls.isEmpty()
                    && (arg.startsWith("http://") || arg.startsWith("https://"))) {
                // Generalized: https URLs previously fell through to the
                // file-reading branch and failed there.
                addUrl(arg);
            } else if (urls.isEmpty()) {
                BufferedReader br = null;
                try {
                    br = new BufferedReader(new FileReader(arg));
                    String url;
                    while ((url = br.readLine()) != null) {
                        addUrl(url);
                    }
                } catch (FileNotFoundException e) {
                    System.err.println("[!] File not found: " + arg);
                    // BUGFIX: error exits report a non-zero status (was 0).
                    System.exit(1);
                } catch (IOException e) {
                    System.err.println("[!] IO error while reading file: " + arg);
                } finally {
                    if (br != null)
                        br.close();
                }
            } else if (words.isEmpty()) {
                words.addAll(Arrays.asList(arg.split(",")));
            } else {
                System.err.println("[!] Malformed parameter: " + arg);
                System.exit(1);
            }
        }
        if (urls.isEmpty()) {
            System.err.println("[!] You must provide at least one url to scrape!");
            System.exit(1);
        }
        if ((wFlag || eFlag) && words.isEmpty()) {
            System.err.println("[!] You must provide words to count or remove '"
                    + (wFlag ? "-w" : "-e") + "' parameter!");
            System.exit(1);
        }
    }

    /**
     * Parse the given string into a URL and queue it for scraping;
     * malformed URLs are reported and skipped.
     */
    private static void addUrl(String url) {
        try {
            urls.add(new URL(url));
        } catch (MalformedURLException e) {
            System.err.println("[!] Malformed URL: " + url);
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment