Skip to content

Instantly share code, notes, and snippets.

@4e6
Created November 26, 2011 14:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save 4e6/1395803 to your computer and use it in GitHub Desktop.
Test task from HireRight
package hr.the4e6;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class Scraper {

    /** Block-level tags whose enclosed text is extracted as one text block. */
    private static final String[] blockTags = { "html", "head", "body",
            "frameset", "script", "noscript", "style", "meta", "link", "title",
            "frame", "noframes", "section", "nav", "aside", "hgroup", "header",
            "footer", "p", "h1", "h2", "h3", "h4", "h5", "h6", "ul", "ol",
            "pre", "div", "blockquote", "hr", "address", "figure",
            "figcaption", "form", "fieldset", "ins", "del", "dl", "dt", "dd",
            "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup",
            "col", "tr", "th", "td", "video", "audio", "canvas", "details",
            "menu", "plaintext" };

    /** Tags whose entire content is discarded when parsing. */
    private static final String[] ignoreTags = { "html", "head", "title",
            "style", "script" };

    /** Set views of the tag arrays for O(1) membership tests. */
    private static final HashSet<String> blockTagsSet = new HashSet<String>(
            Arrays.asList(blockTags));
    private static final HashSet<String> ignoreTagsSet = new HashSet<String>(
            Arrays.asList(ignoreTags));

    // List of URLs to scrape (from command line arguments).
    private static LinkedList<URL> urls = new LinkedList<URL>();
    // List of words to search for (from command line arguments).
    private static LinkedList<String> words = new LinkedList<String>();
    // -w: count occurrences of the given word(s) on the web page(s).
    private static boolean wFlag = false;
    // -c: count the number of characters of each web page.
    private static boolean cFlag = false;
    // -e: extract sentences which contain the given words.
    private static boolean eFlag = false;
    // -v: verbose output (print timing information).
    private static boolean vFlag = false;

    // Pattern to extract an opening tag; group(1) captures the tag name.
    private static final Pattern RX_TAG = Pattern.compile("<(\\w+).*?>");
    // Pattern to extract a sentence from a block of text: starts with a
    // capital letter, ends at ". " or at the end of the block.
    private static final Pattern RX_SENTENCE = Pattern
            .compile("[A-Z].+?(\\.\\s|\\Z)");

    /**
     * Entry point: parses command line arguments, then scrapes and processes
     * each requested URL in turn, printing results to standard output.
     *
     * @param args command line arguments, see {@link #setParameters(String[])}
     * @throws IOException if closing an input stream fails
     */
    public static void main(String[] args) throws IOException {
        // read command line arguments
        setParameters(args);
        for (URL url : urls) {
            LinkedList<String> out = new LinkedList<String>();
            out.add("");
            out.add("---------------------");
            out.add(url.toString());
            out.add("---------------------");
            // Connecting
            StringBuilder html = readUrl(url);
            // Scraping
            long startScraping = System.nanoTime();
            LinkedList<String> blocks = scrape(html);
            long stopScraping = System.nanoTime();
            // Processing
            long startProcessing = System.nanoTime();
            if (wFlag || eFlag) {
                for (String word : words) {
                    out.addAll(countWord(word, blocks));
                }
            }
            if (cFlag) {
                int counter = 0;
                for (String block : blocks) {
                    counter += block.length();
                }
                out.add("Characters: " + counter);
            }
            long stopProcessing = System.nanoTime();
            if (vFlag) {
                long scrapingTime = stopScraping - startScraping;
                long processingTime = stopProcessing - startProcessing;
                out.add("Scraping: " + formatTime(scrapingTime, 0));
                out.add("Processing: " + formatTime(processingTime, 0));
            }
            // printing results
            for (String l : out) {
                System.out.println(l);
            }
        }
    }

    /**
     * Format a duration by repeatedly dividing by 1000 and attaching the
     * matching unit suffix.
     *
     * @param t time value, expressed in the unit denoted by {@code prefix}
     * @param prefix index into the unit table to start from (0 = nanoseconds)
     * @return formatted time with unit suffix, e.g. "12ms"
     */
    private static String formatTime(long t, int prefix) {
        String[] px = { "ns", "us", "ms", "s" };
        if (prefix == px.length - 1 || t / 1000 == 0)
            return t + px[prefix];
        else
            return formatTime(t / 1000, ++prefix);
    }

    /**
     * Count word occurrence in a collection (case sensitive) and prepare
     * statements for printing. Substring matches count too: for example,
     * 'one' is counted inside 'component'.
     *
     * @param word the word to count
     * @param blocks text blocks extracted from a page
     * @return lines ready for printing: a header, plus (with -e) sentences
     */
    private static LinkedList<String> countWord(String word, List<String> blocks) {
        String warningMsg = "[!] Cant extract exact sentences from blocks."
                + " Poor text formatting.";
        LinkedList<String> wordBlocks = filter(word, blocks);
        LinkedList<String> wordSentences = new LinkedList<String>();
        int counter = 0;
        for (String block : wordBlocks) {
            counter += strCount(block, word, 0);
            LinkedList<String> sentences = split(block, RX_SENTENCE);
            wordSentences.addAll(filter(word, sentences));
        }
        // With -w the header carries the total count; otherwise just the word.
        String head = word + ": " + (wFlag ? counter : "");
        wordBlocks.add(warningMsg);
        LinkedList<String> out = new LinkedList<String>();
        out.addFirst(head);
        if (eFlag) {
            // If per-sentence extraction accounted for every occurrence,
            // print the sentences; otherwise fall back to whole blocks plus
            // the formatting warning appended above.
            out.addAll(counter == wordSentences.size() ? wordSentences
                    : wordBlocks);
            out.add("");
        }
        return out;
    }

    /**
     * Count (possibly overlapping) occurrences of {@code subs} in {@code s},
     * added to the running total {@code c}. The search resumes one character
     * past each match start, so e.g. "aa" occurs twice in "aaa".
     *
     * @param s string to search in
     * @param subs substring to search for
     * @param c running total to add to
     * @return {@code c} plus the number of occurrences found
     */
    private static int strCount(String s, String subs, int c) {
        // Iterative form of the former per-occurrence recursion: avoids a
        // potential StackOverflowError on blocks with very many matches.
        int i = s.indexOf(subs);
        while (i != -1) {
            c++;
            i = s.indexOf(subs, i + 1);
        }
        return c;
    }

    /**
     * Get the contents of the given URL, decoded as UTF-8.
     *
     * @param url the page to download
     * @return page markup; empty if the connection failed (a message is
     *         printed to stderr in that case)
     * @throws IOException if closing the stream fails
     */
    private static StringBuilder readUrl(URL url) throws IOException {
        StringBuilder html = new StringBuilder();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(url.openStream(),
                    "UTF-8"));
            String line;
            while ((line = br.readLine()) != null) {
                // Re-insert the line break: readLine() strips it, and gluing
                // lines together would merge words (and break the ". "
                // sentence delimiter) across line boundaries.
                html.append(line).append('\n');
            }
        } catch (IOException e) {
            System.err.println("[!] Can't connect to " + url);
        } finally {
            if (br != null)
                br.close();
        }
        return html;
    }

    /**
     * Extract text blocks from HTML markup. The input buffer is consumed in
     * place: ignored content is deleted, block-level content is collected,
     * and inline tags are stripped.
     *
     * @param in page markup; modified destructively
     * @return list of trimmed, non-empty text blocks
     * @throws IOException declared for API compatibility
     */
    private static LinkedList<String> scrape(StringBuilder in)
            throws IOException {
        LinkedList<Integer> index = new LinkedList<Integer>();
        LinkedList<String> blocks = new LinkedList<String>();
        // Record the start offset of every opening tag. Processing the stack
        // last-to-first means deletions happen at or after the popped offset,
        // keeping the earlier offsets still on the stack valid.
        Matcher tag = RX_TAG.matcher(in);
        while (tag.find()) {
            index.push(tag.start());
        }
        while (!index.isEmpty()) {
            Matcher startTags = RX_TAG.matcher(in);
            String startTag = null;
            int startTag_s = 0, startTag_e = 0, startTag_l = 0;
            if (startTags.find(index.pop())) {
                startTag = startTags.group(1);
                startTag_s = startTags.start();
                startTag_e = startTags.end();
                startTag_l = startTag_e - startTag_s;
            }
            if (startTag == null) {
                // Stale offset: the tag was inside content deleted by an
                // earlier iteration. Nothing left to do for it.
                continue;
            }
            String endTag = "</" + startTag + ">";
            // Case-insensitive so </DIV> closes <div> as well; the tag name
            // is \w+ so it contains no regex metacharacters.
            Matcher endTags = Pattern.compile(endTag, Pattern.CASE_INSENSITIVE)
                    .matcher(in);
            int endTag_s = 0, endTag_e = 0;
            if (endTags.find(startTag_e)) {
                endTag_s = endTags.start();
                endTag_e = endTags.end();
                String sentence = in.substring(startTag_e, endTag_s).trim();
                if (ignoreTagsSet.contains(startTag.toLowerCase())) {
                    // Drop the element and everything inside it.
                    in.delete(startTag_s, endTag_e);
                } else if (blockTagsSet.contains(startTag.toLowerCase())) {
                    // Keep the element's text as one block, then drop it.
                    if (!sentence.isEmpty()) {
                        blocks.add(sentence);
                    }
                    in.delete(startTag_s, endTag_e);
                } else {
                    // Inline tag: strip the markup but keep the content.
                    // After the first delete, the end tag shifted left by the
                    // length of the removed opening tag.
                    in.delete(startTag_s, startTag_e);
                    in.delete(endTag_s - startTag_l, endTag_e - startTag_l);
                }
            } else {
                // Unclosed tag: remove just the opening tag.
                in.delete(startTag_s, startTag_e);
            }
        }
        return blocks;
    }

    /**
     * Filter a collection, retaining elements which contain the given word.
     *
     * @param word substring to look for (case sensitive)
     * @param collection strings to filter
     * @return new list with only the matching elements
     */
    private static LinkedList<String> filter(String word,
            List<String> collection) {
        LinkedList<String> filteredCollection = new LinkedList<String>();
        for (String string : collection) {
            if (string.contains(word)) {
                filteredCollection.add(string);
            }
        }
        return filteredCollection;
    }

    /**
     * Split a string into the substrings matched by the given pattern.
     *
     * @param block string to split
     * @param pattern pattern whose matches become the result elements
     * @return list of matched substrings, in order
     */
    private static LinkedList<String> split(String block, Pattern pattern) {
        LinkedList<String> wordSentences = new LinkedList<String>();
        Matcher m = pattern.matcher(block);
        while (m.find()) {
            wordSentences.add(m.group());
        }
        return wordSentences;
    }

    /**
     * Parse and set up command line arguments. Accepted forms:
     * flags (-w, -c, -e, -v), then either a single http(s) URL or the name of
     * a file containing one URL per line, then an optional comma-separated
     * word list. Exits with status 1 on fatal argument errors.
     *
     * @param args command line arguments
     * @throws IOException if closing the URL file fails
     */
    private static void setParameters(String[] args) throws IOException {
        for (String arg : args) {
            if ("-w".equals(arg)) {
                wFlag = true;
            } else if ("-c".equals(arg)) {
                cFlag = true;
            } else if ("-e".equals(arg)) {
                eFlag = true;
            } else if ("-v".equals(arg)) {
                vFlag = true;
            } else if (urls.isEmpty()
                    && (arg.startsWith("http://") || arg.startsWith("https://"))) {
                // First non-flag argument may be a single URL...
                addUrl(arg);
            } else if (urls.isEmpty()) {
                // ...or the name of a file listing one URL per line.
                BufferedReader br = null;
                try {
                    br = new BufferedReader(new FileReader(arg));
                    String url;
                    while ((url = br.readLine()) != null) {
                        addUrl(url);
                    }
                } catch (FileNotFoundException e) {
                    System.err.println("[!] File not found: " + arg);
                    System.exit(1);
                } catch (IOException e) {
                    System.err.println("[!] IO error while reading file: " + arg);
                } finally {
                    if (br != null)
                        br.close();
                }
            } else if (words.isEmpty()) {
                // Comma-separated list of words to search for.
                words.addAll(Arrays.asList(arg.split(",")));
            } else {
                System.err.println("[!] Malformed parameter: " + arg);
                System.exit(1);
            }
        }
        if (urls.isEmpty()) {
            System.err.println("[!] You must provide at least one url to scrape!");
            System.exit(1);
        }
        if ((wFlag || eFlag) && words.isEmpty()) {
            System.err.println("[!] You must provide words to count or remove '"
                    + (wFlag ? "-w" : "-e") + "' parameter!");
            System.exit(1);
        }
    }

    /**
     * Parse a URL string and add it to the scrape list; malformed URLs are
     * reported to stderr and skipped.
     *
     * @param url URL string to add
     */
    private static void addUrl(String url) {
        try {
            urls.add(new URL(url));
        } catch (MalformedURLException e) {
            System.err.println("[!] Malformed URL: " + url);
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment