Skip to content

Instantly share code, notes, and snippets.

@pxpc2
Created February 23, 2013 13:30
Show Gist options
  • Save pxpc2/5019743 to your computer and use it in GitHub Desktop.
Save pxpc2/5019743 to your computer and use it in GitHub Desktop.
package us.brtm.bot.loader.crawler;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.WritableByteChannel;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.InflaterInputStream;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import us.brtm.bot.loader.Settings;
/**
 * Scrapes the client parameters embedded in the game web page.
 *
 * <p>Works for the official RuneScape client (7xx versions) and for the
 * RuneScape '07 client.
 *
 * <p>The page at {@code RS_WORLD} is downloaded on the first call to
 * {@link #getParameter(String)}; its parameters are then cached in a static map.
 * If the download or parsing fails, nothing is cached and the next call tries again.
 * {@code RS_WORLD} and {@code RS_HOST} are not declared in this class; presumably
 * they are constants inherited from {@code Settings}.
 *
 * <p>This class performs no synchronization of its own.
 *
 * @author pxpc2
 */
public final class WebCrawler implements Settings {

    /**
     * Matches {@code <param name="..." value="...">} tags in the raw page;
     * group 1 is the parameter name, group 2 its value.
     */
    private static final Pattern COMPLEX_PARAM =
            Pattern.compile("<param name=\"([^\\s]+)\"\\s+value=\"([^>]*)\">");

    /**
     * Matches {@code document.write('...');} calls in the raw page;
     * group 1 is the markup fragment being written.
     */
    private static final Pattern BASIC_PARAM =
            Pattern.compile("document\\.write\\('(.+?)'\\);");

    /**
     * Extracts the charset from a Content-Type header value; group 1 is the charset name.
     * Tolerates upper-case spelling, whitespace around {@code =} and a quoted value.
     */
    private static final Pattern CHARSET =
            Pattern.compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

    /** Charset assumed when the server declares {@code text/html} without a charset. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /** Cached crawl result; {@code null} until a crawl has succeeded. */
    private static Map<String, String> parameters = null;

    /** Static utility class; not instantiable. */
    private WebCrawler() {
    }

    /**
     * Runs the crawl and stores its result in the cache.
     *
     * <p>Failures are reported on standard error and otherwise ignored, which leaves
     * the cache {@code null} so that a later lookup can retry.
     */
    private static void load() {
        try {
            parameters = crawl();
        } catch (final IOException | RuntimeException e) {
            e.printStackTrace();
        }
    }

    /**
     * Looks up a crawled parameter, crawling the page first if no result is cached yet.
     *
     * @param key the parameter name
     * @return the parameter value, or {@code null} if the key is unknown or the
     *         page could not be crawled
     */
    public static String getParameter(final String key) {
        if (parameters == null) {
            load();
        }
        // load() is best-effort: after a failed crawl there is still nothing to look up.
        final Map<String, String> loaded = parameters;
        return loaded == null ? null : loaded.get(key);
    }

    /**
     * Downloads the page at {@code RS_WORLD} and collects its parameters.
     *
     * <p>Two sources are merged into one map: the attributes of the first
     * {@code <applet>} element found in the markup emitted through
     * {@code document.write('...')} calls, and every
     * {@code <param name="..." value="...">} tag in the raw page. A {@code <param>}
     * entry overwrites an applet attribute with the same name.
     *
     * @return a new map from parameter name to value
     * @throws IOException if the page cannot be downloaded or its charset is unsupported
     * @throws RuntimeException if the response is not {@code text/html}
     */
    private static Map<String, String> crawl() throws IOException {
        final Map<String, String> crawled = new HashMap<>();
        System.out.println("Crawling parameters");
        final long startTime = System.currentTimeMillis();

        final ByteArrayOutputStream out = new ByteArrayOutputStream();
        final URLConnection con = getURL(new URL(RS_WORLD), RS_HOST, Channels.newChannel(out));
        final String webpage = new String(out.toByteArray(), getTextCharset(con.getContentType()));

        // Stitch the document.write fragments back together so they can be parsed as HTML.
        final StringBuilder appletMarkup = new StringBuilder();
        final Matcher basic = BASIC_PARAM.matcher(webpage);
        while (basic.find()) {
            appletMarkup.append(basic.group(1));
        }

        final Document document = Jsoup.parseBodyFragment(appletMarkup.toString());
        final Element applet = document.select("applet").first();
        if (applet != null) {
            // Attributes is Iterable<Attribute>, so no access to jsoup internals is needed.
            for (final Attribute attribute : applet.attributes()) {
                crawled.put(attribute.getKey(), attribute.getValue());
            }
        }

        final Matcher complex = COMPLEX_PARAM.matcher(webpage);
        while (complex.find()) {
            crawled.put(complex.group(1), complex.group(2));
        }

        final long finishTime = System.currentTimeMillis();
        System.out.print("Crawled: " + crawled.size() + " parameters \t | \t\t");
        System.out.println("Time taken: " + formatTime(finishTime - startTime) + "\n");
        return crawled;
    }

    /**
     * Determines the charset to decode a {@code text/html} response with.
     *
     * @param contentType the Content-Type header value; may be {@code null}
     * @return the declared charset name, or {@code "UTF-8"} if none is declared
     * @throws RuntimeException if the media type is missing or is not {@code text/html}
     */
    private static String getTextCharset(final String contentType) {
        String mediaType = null;
        if (contentType != null) {
            final int separator = contentType.indexOf(';');
            mediaType = (separator < 0 ? contentType : contentType.substring(0, separator)).trim();
        }
        if (!"text/html".equalsIgnoreCase(mediaType)) {
            throw new RuntimeException(contentType + " is not a text/html");
        }
        final Matcher m = CHARSET.matcher(contentType);
        return m.find() ? m.group(1) : DEFAULT_CHARSET;
    }

    /**
     * Copies everything from {@code in} to {@code out}, then closes {@code out}.
     *
     * <p>{@code out} is closed even when the copy fails; {@code in} is left open
     * for the caller to close.
     *
     * @param in the channel to read from until end of stream
     * @param out the channel to write to; always closed on return
     * @throws IOException if reading, writing or closing fails
     */
    public static void channelCopy(final ReadableByteChannel in, final WritableByteChannel out) throws IOException {
        try (final WritableByteChannel sink = out) {
            final ByteBuffer buffer = ByteBuffer.allocateDirect(1024 * 4);
            // Keep going after end of stream until a partial write has been fully drained.
            while (in.read(buffer) >= 0 || buffer.position() != 0) {
                buffer.flip();
                sink.write(buffer);
                buffer.compact();
            }
        }
    }

    /**
     * Requests {@code url} with browser-like headers and copies the response body to {@code out}.
     *
     * <p>Because the request advertises gzip and deflate support, a compressed
     * response is decompressed before it is written, so {@code out} always receives
     * the decoded body. {@code out} is closed when the copy finishes.
     *
     * @param url the resource to fetch
     * @param host the value sent as the {@code Host} request header
     * @param out the channel that receives the decoded body; closed on return
     * @return the connection, so that callers can inspect the response headers
     * @throws IOException if the connection or the copy fails
     */
    public static URLConnection getURL(final URL url, final String host, final WritableByteChannel out) throws IOException {
        final URLConnection c = url.openConnection();
        c.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        c.setRequestProperty("Accept-Encoding", "gzip, deflate");
        c.setRequestProperty("Accept-Language", "en-gb,en;q=0.5");
        // NOTE(review): the JDK HTTP handler may treat Host as a restricted header and
        // ignore this call unless restricted headers are enabled - confirm.
        c.setRequestProperty("Host", host);
        c.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 5.1; rv:16.0) Gecko/20100101 Firefox/16.0");
        try (final ReadableByteChannel body = Channels.newChannel(openDecodedStream(c))) {
            channelCopy(body, out);
        }
        return c;
    }

    /** Opens the response body, undoing any gzip or deflate Content-Encoding the server applied. */
    private static InputStream openDecodedStream(final URLConnection c) throws IOException {
        final InputStream raw = c.getInputStream();
        final String encoding = c.getContentEncoding();
        try {
            if ("gzip".equalsIgnoreCase(encoding) || "x-gzip".equalsIgnoreCase(encoding)) {
                return new GZIPInputStream(raw);
            }
            if ("deflate".equalsIgnoreCase(encoding)) {
                return new InflaterInputStream(raw);
            }
            return raw;
        } catch (final IOException | RuntimeException e) {
            // The wrapper could not be created, so nothing else will close the raw stream.
            raw.close();
            throw e;
        }
    }

    /**
     * Formats a duration as {@code HH:mm:ss}.
     *
     * <p>Hours wrap around at 24, so whole days are dropped. Negative durations are
     * not meaningful and produce malformed output.
     *
     * @param time the duration in milliseconds
     * @return the zero-padded {@code HH:mm:ss} representation
     */
    protected static String formatTime(final long time) {
        final long totalSecs = time / 1000;
        final long totalMins = totalSecs / 60;
        final long totalHrs = totalMins / 60;
        final StringBuilder t = new StringBuilder(8);
        // Reduce in long arithmetic first; casting before the modulo would truncate large values.
        appendTwoDigits(t, (int) (totalHrs % 24)).append(":");
        appendTwoDigits(t, (int) (totalMins % 60)).append(":");
        appendTwoDigits(t, (int) (totalSecs % 60));
        return t.toString();
    }

    /** Appends {@code value} to {@code target}, left-padded with one zero when below ten. */
    private static StringBuilder appendTwoDigits(final StringBuilder target, final int value) {
        if (value < 10) {
            target.append("0");
        }
        return target.append(value);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment