Created
February 23, 2013 13:30
-
-
Save pxpc2/5019743 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package us.brtm.bot.loader.crawler; | |
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.WritableByteChannel;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.InflaterInputStream;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import us.brtm.bot.loader.Settings;
/** | |
* Works for official Runescape client (7xx versions) | |
* and for Runescape 07' client | |
* | |
* @author pxpc2 | |
*/ | |
public final class WebCrawler implements Settings { | |
private static final String COMPLEX_PARAM = "<param name=\"([^\\\\s]+)\"\\\\s+value=\"([^>]*)\">"; | |
private static final String BASIC_PARAM = "document\\.write\\('(.+?)'\\);"; | |
private static HashMap<String, String> parameters = null; | |
private WebCrawler() { | |
} | |
private static void load() { | |
try { | |
parameters = crawl(); | |
} catch (final Exception e) { | |
e.printStackTrace(); | |
} | |
} | |
public static String getParameter(final String key) { | |
if (parameters == null) { | |
load(); | |
} | |
return parameters.get(key); | |
} | |
private static HashMap<String, String> crawl() throws IOException { | |
final HashMap<String, String> parameters = new HashMap<>(); | |
System.out.println("Crawling parameters"); | |
final long startTime = System.currentTimeMillis(); | |
final ByteArrayOutputStream out = new ByteArrayOutputStream(); | |
final URLConnection con = getURL(new URL(RS_WORLD), RS_HOST, Channels.newChannel(out)); | |
final String webpage = new String(out.toByteArray(), getTextCharset(con.getContentType())); | |
Matcher matcher = Pattern.compile(BASIC_PARAM).matcher(webpage); | |
final StringBuilder builder = new StringBuilder(); | |
while (matcher.find()) { | |
builder.append(matcher.group(1)); | |
} | |
final Document document = Jsoup.parseBodyFragment(builder.toString()); | |
final Element element = document.select("applet").first(); | |
// Had to do some hackery with jsoup lib to get this hashmap | |
final LinkedHashMap<String, Attribute> params = element.attributes.attributes; | |
for (final String key : params.keySet()) { | |
parameters.put(key, params.get(key).getValue()); | |
} | |
matcher = Pattern.compile(COMPLEX_PARAM).matcher(webpage); | |
while (matcher.find()) { | |
parameters.put(matcher.group(1), matcher.group(2)); | |
} | |
final long finishTime = System.currentTimeMillis(); | |
System.out.printf("Crawled: " + parameters.size() + " parameters \t | \t\t"); | |
System.out.println("Time taken: " + formatTime(finishTime - startTime) + "\n"); | |
return parameters; | |
} | |
private static String getTextCharset(final String contentType) { | |
if (contentType.split(";")[0].trim().equals("text/html")) { | |
Matcher m = Pattern.compile("charset=([a-zA-Z0-9\\-]+)").matcher(contentType); | |
m.find(); | |
return m.group(1); | |
} | |
throw new RuntimeException(contentType + " is not a text/html"); | |
} | |
public static void channelCopy(final ReadableByteChannel in, final WritableByteChannel out) throws IOException { | |
final ByteBuffer buffer = ByteBuffer.allocateDirect(1024 * 4); | |
while (in.read(buffer) >= 0 || buffer.position() != 0) { | |
buffer.flip(); | |
out.write(buffer); | |
buffer.compact(); | |
} | |
out.close(); | |
} | |
public static URLConnection getURL(final URL url, final String host, final WritableByteChannel out) throws IOException { | |
final URLConnection c = url.openConnection(); | |
c.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*\\/*;q=0.8"); | |
c.setRequestProperty("Accept-Encoding", "gzip, deflate"); | |
c.setRequestProperty("Accept-Language", "en-gb,en;q=0.5"); | |
c.setRequestProperty("Host", host); | |
c.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 5.1; rv:16.0) Gecko/20100101 Firefox/16.0"); | |
channelCopy(Channels.newChannel(c.getInputStream()), out); | |
return c; | |
} | |
protected static String formatTime(final long time) { | |
final StringBuilder t = new StringBuilder(); | |
final long total_secs = time / 1000; | |
final long total_mins = total_secs / 60; | |
final long total_hrs = total_mins / 60; | |
final int secs = (int) total_secs % 60; | |
final int mins = (int) total_mins % 60; | |
final int hrs = (int) total_hrs % 24; | |
if (hrs < 10) { | |
t.append("0"); | |
} | |
t.append(hrs); | |
t.append(":"); | |
if (mins < 10) { | |
t.append("0"); | |
} | |
t.append(mins); | |
t.append(":"); | |
if (secs < 10) { | |
t.append("0"); | |
} | |
t.append(secs); | |
return t.toString(); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment