-
-
Save GregHib/5be66f39c2f90a761618535a44a85d46 to your computer and use it in GitHub Desktop.
Website Archiver
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import world.gregs.voidps.tools.UrlHandler.convertQuery | |
import world.gregs.voidps.tools.UrlHandler.offset | |
import world.gregs.voidps.tools.UrlHandler.removeDomain | |
import world.gregs.voidps.tools.UrlHandler.removePrefixDomain | |
import world.gregs.voidps.tools.UrlHandler.removeSuffixDomain | |
import world.gregs.voidps.tools.UrlHandler.trimAnchor | |
import world.gregs.voidps.tools.UrlHandler.trimQuery | |
import kotlinx.coroutines.Dispatchers | |
import kotlinx.coroutines.GlobalScope | |
import kotlinx.coroutines.launch | |
import org.jsoup.Jsoup | |
import org.jsoup.nodes.Document | |
import java.io.File | |
import java.net.HttpURLConnection | |
import java.net.URL | |
import java.util.concurrent.ConcurrentHashMap | |
import java.util.concurrent.ConcurrentLinkedQueue | |
/**
 * Mirrors an Archive-It Wayback capture of runescape.com into a local directory.
 *
 * The output directory is `./yyyy-MM-dd/`, derived from the first eight characters of [date].
 * Pages are fetched one at a time via [next]; every text page has its runescape.com / jagex.com
 * links rewritten to relative local paths, and each rewritten link is queued for download in turn.
 *
 * @param date Wayback timestamp; the first 8 characters are read as `yyyyMMdd`.
 * @param languages when false, URLs containing `l=` or `set_lang=` are skipped.
 * @param knowledgeBase when false, URLs containing `kbase` are skipped.
 * @param downloads when false, URLs containing `downloads_and_wallpapers` are skipped.
 * @param singlePage when true, only URLs accepted by [supportedFileDownloads] are followed.
 */
class SiteMirror(
    date: String,
    private val languages: Boolean,
    private val knowledgeBase: Boolean,
    private val downloads: Boolean,
    private val singlePage: Boolean,
) {
    private val year = date.take(4).toInt()
    private val output = File("./$year-${date.substring(4, 6)}-${date.substring(6, 8)}/")

    /** Local paths that have already been queued; guards against fetching a page twice. */
    private val all: MutableSet<String> = ConcurrentHashMap.newKeySet()

    /** Pending work as (remote url, local path) pairs. */
    private val queue = ConcurrentLinkedQueue<Pair<String, String>>()
    private val validUrlRegex = "https?:\\\\?/\\\\?/wayback\\.archive-it\\.org\\\\?/all\\\\?/.*?\\\\?/https?:\\\\?/\\\\?/(?:[a-zA-Z0-9-.]+?)?(?:runescape|jagex).com".toRegex()
    private val testRegex = "(?:https?:)?\\\\?/\\\\?/[-a-zA-Z0-9+&@#\\\\/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]".toRegex()
    private val staticRegex = "\"(/wb-static/[-a-zA-Z0-9+&@#\\\\/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|])\"".toRegex()

    init {
        queue("http://wayback.archive-it.org/all/$date/http://www.runescape.com/", force = true)
    }

    /**
     * Takes the next queued page, if any, and fetches it on a background coroutine.
     * Returns immediately when the queue is empty.
     */
    fun next() {
        val (req, path) = queue.poll() ?: return
        // NOTE(review): grabPage performs blocking I/O; the dispatcher is kept as-is so the
        // number of parallel requests against the archive does not change.
        GlobalScope.launch(Dispatchers.Default) {
            grabPage(req, path)
        }
    }

    /**
     * Reads the four characters that follow the first `all/` in [path] as a year.
     * Returns null when the marker is missing, the path is too short, or the characters are not a number.
     */
    private fun captureYear(path: String): Int? {
        val marker = path.indexOf("all/")
        if (marker < 0) {
            return null
        }
        val start = marker + 4
        if (path.length < start + 4) {
            return null
        }
        return path.substring(start, start + 4).toIntOrNull()
    }

    /** Whether [path] is excluded by the capture year or by one of the constructor flags. */
    private fun shouldSkip(path: String): Boolean {
        val captured = captureYear(path)
        return when {
            captured != null && captured < year -> true
            singlePage && !supportedFileDownloads(trimQuery(trimAnchor(path))) -> true
            !languages && (path.contains("l=") || path.contains("set_lang=")) -> true
            !knowledgeBase && path.contains("kbase") -> true
            !downloads && path.contains("downloads_and_wallpapers") -> true
            else -> false
        }
    }

    /**
     * Queues [archived] for download unless it is filtered out, has no local path, or was queued before.
     *
     * @param force bypasses [shouldSkip].
     */
    private fun queue(archived: String, force: Boolean = false) {
        if (!force && shouldSkip(archived)) {
            return
        }
        val path = getPath(archived) ?: return
        // Set.add is atomic, so two coroutines cannot both enqueue the same path.
        if (all.add(path)) {
            queue.add(archived to path)
        }
    }

    /**
     * Maps a remote url onto the local file path it is stored under, or null when the url is not mirrored.
     */
    private fun getPath(source: String): String? = when {
        validUrlRegex.containsMatchIn(source) -> {
            val anchorIndex = source.indexOf("#")
            val withoutAnchor = if (anchorIndex >= 0) source.substring(0, anchorIndex) else source
            val page = removePrefixDomain(withoutAnchor.replace(".ws", ".html"))
            val path = convertQuery(removeDomain(page, "runescape.com"))
            when {
                path.isBlank() || path == "runescape.com" -> "index.html"
                path.endsWith("/") -> path + "index.html"
                !path.endsWith(".html") && !supportedFileDownloads(path) -> "$path.html"
                else -> path
            }
        }
        !source.contains("http") && !supportedFileDownloads(source) -> convertQuery(source)
        source.contains("/wb-static") -> source.substring(source.indexOf("/wb-static") + 1)
        source.contains("partner.archive-it.org/static/") -> source.substring(source.indexOf("/static/") + 1)
        else -> null
    }

    /**
     * Removes `style` elements whose content mentions "disclaim" and all `wb_div` elements from [document].
     * Script elements are left untouched.
     */
    fun removeDisclaimer(document: Document) {
        // Iterate over copies: the elements are detached from the document while looping.
        for (style in document.select("style").toList()) {
            if (style.data().contains("disclaim")) {
                style.remove()
            }
        }
        for (banner in document.select("wb_div").toList()) {
            banner.remove()
        }
    }

    /**
     * Downloads [source] and stores it under [path] inside the output directory.
     * Text formats are link-rewritten first; anything else is written byte-for-byte unless the file already exists.
     * Non-200 responses are ignored.
     */
    private fun grabPage(source: String, path: String) {
        val connection = URL(source).openConnection() as HttpURLConnection
        try {
            if (connection.responseCode != 200) {
                return
            }
            println("Grab $source $path ${isTextFormat(path)}")
            val out = File(output, trimAnchor(path))
            if (isTextFormat(path)) {
                val raw = connection.inputStream.use { it.readBytes() }.toString(Charsets.UTF_8)
                val data = rewriteLinks(raw, source, path)
                out.parentFile.mkdirs()
                if (out.extension == "html") {
                    writeHtml(out, data)
                } else {
                    out.writeText(data)
                }
            } else if (!out.exists()) {
                out.parentFile.mkdirs()
                out.writeBytes(connection.inputStream.use { it.readBytes() })
            }
        } finally {
            connection.disconnect()
        }
    }

    /**
     * Rewrites every mirrored link in [content] to a path relative to [path] and queues the targets.
     * Matches are processed last-to-first so earlier match ranges stay valid after each replacement.
     */
    private fun rewriteLinks(content: String, source: String, path: String): String {
        var data = content
        val depth = path.count { it == '/' }
        val prefix = removeSuffixDomain(source)
        for (match in testRegex.findAll(data).toList().reversed()) {
            var original = standardise(match.value)
            var url = original
            if (url.contains("archive-it.org")) {
                url = removePrefixDomain(url)
            }
            if (url.contains("runescape.com") || url.contains("jagex.com")) {
                if (!original.contains("archive-it.org") && prefix.contains("archive-it.org")) {
                    original = "$prefix$original"
                }
                val other = getPath(original) ?: continue
                url = offset(other, depth)
                queue(original)
            }
            if (url != original) {
                data = data.replaceRange(match.range, url)
            }
        }
        for (match in staticRegex.findAll(data).toList().reversed()) {
            val original = standardise(match.groupValues.last())
            // Only enqueue an asset the first time it is seen.
            if (all.add(original)) {
                queue.add("http://wayback.archive-it.org$original" to original)
            }
            val local = getPath(original) ?: continue
            // Replace only the captured path so the surrounding quotes are kept.
            val range = match.groups[1]?.range ?: continue
            data = data.replaceRange(range, offset(local, depth))
        }
        if (data.contains(AIT)) {
            val local = getPath(AIT)
            if (local != null) {
                data = data.replace(AIT, offset(local, depth))
                queue("https:$AIT")
            }
        }
        return data
    }

    /** Parses [data] as html, strips the archive banner, ensures a utf-8 charset declaration and writes [out]. */
    private fun writeHtml(out: File, data: String) {
        val document = Jsoup.parse(data.replace("charset=iso-8859-1", "charset=utf-8"))
        removeDisclaimer(document)
        if (!data.contains("charset=")) {
            document.head().appendElement("meta").attr("http-equiv", "Content-Type").attr("content", "text/html;charset=utf-8")
        }
        out.writeText(document.toString(), Charsets.UTF_8)
    }

    /** Gives protocol-relative urls an `https:` scheme and unescapes `\/` into `/`. */
    private fun standardise(url: String): String {
        val absolute = if (url.startsWith("//")) "https:$url" else url
        return absolute.replace("\\/", "/")
    }

    companion object {
        const val AIT = "//partner.archive-it.org/static/AIT_Analytics.js"

        private val TEXT_EXTENSIONS = listOf(".ws", ".html", ".css", ".js")

        private val DOWNLOAD_EXTENSIONS = listOf(
            ".exe", ".msi", ".mp3", ".gif", ".jpg", ".png", ".bz2", ".zip",
            ".tar", ".jar", ".ico", ".rss", ".css", ".js", ".json", ".svg",
            ".dmg", ".woff", ".woff2", ".ttf", ".eot", ".webp", ".webm", ".mp4",
        )

        /** Whether [url] ends (case-insensitively) with an extension that is link-rewritten as text. */
        fun isTextFormat(url: String): Boolean = TEXT_EXTENSIONS.any { url.endsWith(it, ignoreCase = true) }

        /** Whether [file] ends (case-insensitively) with one of the asset extensions that are mirrored. */
        fun supportedFileDownloads(file: String): Boolean = DOWNLOAD_EXTENSIONS.any { file.endsWith(it, ignoreCase = true) }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import world.gregs.voidps.tools.SiteMirror.Companion.isTextFormat | |
import world.gregs.voidps.tools.SiteMirror.Companion.supportedFileDownloads | |
import world.gregs.voidps.tools.UrlHandler.convertQuery | |
import world.gregs.voidps.tools.UrlHandler.removeDomain | |
import world.gregs.voidps.tools.UrlHandler.trimAnchor | |
import world.gregs.voidps.tools.UrlHandler.trimQuery | |
import kotlinx.coroutines.Dispatchers | |
import kotlinx.coroutines.GlobalScope | |
import kotlinx.coroutines.launch | |
import java.io.File | |
import java.io.InputStream | |
import java.net.ConnectException | |
import java.net.HttpURLConnection | |
import java.net.URL | |
import java.net.URLDecoder | |
import java.util.concurrent.ConcurrentHashMap | |
import java.util.concurrent.ConcurrentLinkedQueue | |
import java.util.zip.GZIPInputStream | |
/** Matches absolute or protocol-relative urls, including ones whose slashes are backslash-escaped. */
private val testRegex = Regex("(?:https?:)?\\\\?/\\\\?/[-a-zA-Z0-9+&@#\\\\/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]")

/** Matches `src` / `href` attributes whose quoted value starts with `/`; captures the attribute name and the value. */
private val srcRegex = Regex("(src|href)=\"(/.*?)\"")

/**
 * Matches a placeholder-gif `src` followed by a `<noscript><img>` fallback and captures the fallback's `src`.
 * NOTE(review): the base64 literal contains `https:`; presumably it is meant to match the payload after the
 * generic url pass has prefixed the `//` inside it - confirm before "correcting" the literal.
 */
private val noScriptRegex = Regex("src=\"data:image/gif;base64,R0lGODlhAQABAIAAAAAAAPhttps:///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7\".*?<noscript><img.*?src=\"(.*?)\"")

/** Matches css `url(...)` references; captures the opening `url(` with its optional quote, and the url itself. */
private val urlRegex = Regex("(url\\(\"?'?)(.*?)'?\"?\\)")

/** Matches `[/path, name]` pairs; captures the path and the lowercase name. */
private val interchangeRegex = Regex("\\[(/.*?), ([a-z]+)]")

/** Directory the live mirror is written into. */
private val output = File("./live/")
/**
 * Seeds a [SiteMirrorLive] with one example url per site section and then keeps dispatching
 * queued pages forever. The process never exits on its own.
 */
fun main() {
    val scrapper = SiteMirrorLive()
    val user = "2+Taint3d"
    val seeds = listOf(
        "https://runescape.com/",
        "https://play.runescape.com/",
        "https://play.runescape.com/runescape",
        "https://play.runescape.com/oldschool",
        "https://play.runescape.com/returning-players",
        "https://rs.runescape.com/premier-club",
        "https://secure.runescape.com/m=forum/forums", // Forums
        "https://secure.runescape.com/m=forum/forums?294,295,thd,391,66049913", // Forum category
        "https://secure.runescape.com/m=forum/forums?294,295,396,66126612", // Forum thread
        "https://apps.runescape.com/runemetrics/app/welcome", // Runemetrics
        "https://apps.runescape.com/runemetrics/app/overview/player/$user", // Runemetrics user
        "https://apps.runescape.com/runemetrics/app/levels/player/$user", // Runemetrics levels
        "https://apps.runescape.com/runemetrics/app/xp-monthly/player/$user/-1", // Runemetrics xp
        "https://apps.runescape.com/runemetrics/app/activities/player/$user", // Runemetrics event log
        "https://apps.runescape.com/runemetrics/app/quests/player/$user", // Runemetrics quests
        "https://secure.runescape.com/m=itemdb_rs/results", // GE search
        "https://secure.runescape.com/m=itemdb_rs/", // GE
        "https://secure.runescape.com/m=itemdb_rs/catalogue", // GE catalogue
        "https://secure.runescape.com/m=itemdb_rs/catalogue?cat=32", // GE catalogue
        "https://secure.runescape.com/m=itemdb_rs/top100?list=2", // GE price rises
        "https://secure.runescape.com/m=itemdb_rs/Santa+hat/viewitem?obj=1050", // GE item
        "https://secure.runescape.com/m=hiscore/", // Hiscores
        "https://secure.runescape.com/m=hiscore/ranking", // Hiscores
        "https://secure.runescape.com/m=hiscore/ranking?category_type=1&table=26", // Achievements
        "http://services.runescape.com/m=clan-hiscores/ranking", // Clans
        "http://services.runescape.com/m=clan-hiscores/landing.ws", // Clans
        "http://services.runescape.com/m=clan-home/clan/Efficiency+Experts", // Clan info
        "http://services.runescape.com/m=clan-hiscores/compare.ws?clanName=Efficiency+Experts", // Clan stats
        "http://services.runescape.com/m=clan-hiscores/members.ws?clanName=Efficiency+Experts", // Clan mates
        "http://services.runescape.com/m=temp-hiscores/", // Seasonal
        "http://services.runescape.com/m=temp-hiscores/ranking?id=1508716800045&filter=-1&page=1", // Seasonal topic
        "http://services.runescape.com/m=temp-hiscores/compare?user1=Sina", // Seasonal player
        "https://secure.runescape.com/m=hiscore/ranking?category_type=1", // Activities
        "https://secure.runescape.com/m=hiscore/compare?user1=Blacked+Out&category_type=1", // Player activities
        "https://secure.runescape.com/m=hiscore/ranking?category_type=0&table=0&time_filter=1&page=1",
        "https://services.runescape.com/m=hiscore/compare?user1=le+me", // Player
        "https://secure.runescape.com/m=hiscore/compare", // Player
        "https://oldschool.runescape.com/premier-club/",
        "https://secure.runescape.com/m=poll/", // Player power
        "https://secure.runescape.com/m=poll/archive?id=1596", // Poll
        "https://support.runescape.com/hc/en-gb", // Support
        "https://support.runescape.com/hc/en-gb/categories/200977391-Your-account", // Category
        "https://support.runescape.com/hc/en-gb/articles/360001313349-Go-here-to-install", // Post
        "https://secure.runescape.com/m=news/", // News
        "https://secure.runescape.com/m=news/archive?oldschool=1", // News
        "https://secure.runescape.com/m=news/shooting-stars-poll-blog?oldschool=1", // osrs news post
        "https://secure.runescape.com/m=news/8th-birthday-event-and-lms-changes?oldschool=1", // osrs news post
        "https://secure.runescape.com/m=news/the-isle-of-souls?oldschool=1", // osrs news post
        "https://secure.runescape.com/m=news/equipment-rebalancing-updated?oldschool=1", // osrs news post
        "https://secure.runescape.com/m=news/equipment-rebalance-postponed?oldschool=1", // osrs news post
        "https://secure.runescape.com/m=news/below-ice-mountain-poll-blog?oldschool=1", // osrs news post
        "https://secure.runescape.com/m=itemdb_oldschool/", // GE
        "https://secure.runescape.com/m=itemdb_oldschool/top100?list=1&scale=0", // GE most traded
        "https://secure.runescape.com/m=itemdb_oldschool/Santa+hat/viewitem?obj=1050", // GE item
        "https://secure.runescape.com/m=poll/oldschool/", // poll
        "https://secure.runescape.com/m=poll/oldschool/results?id=1616", // poll results
        "https://secure.runescape.com/m=hiscore_oldschool/overall", // hiscores
        "https://secure.runescape.com/m=hiscore_oldschool/hiscorepersonal?user1=Lynx Titan", // hiscore user
    )
    for (seed in seeds) {
        // force = true: seeds are fetched even when the crawl filter would skip their section.
        scrapper.queue(seed, force = true)
    }
    while (true) {
        scrapper.next()
        // next() returns immediately when nothing is queued; yield briefly instead of spinning a core.
        Thread.sleep(1)
    }
}
/**
 * Mirrors pages of the live runescape.com sites into the file-level `output` directory.
 *
 * Urls are added with [queue] and fetched one at a time with [next]. Text pages have their links to
 * mirrored hosts rewritten to relative local paths, and every rewritten target is queued in turn.
 */
class SiteMirrorLive {
    /** Local paths that have already been queued; guards against fetching a page twice. */
    private val all: MutableSet<String> = ConcurrentHashMap.newKeySet()

    /** Pending work as (remote url, local path) pairs. */
    private val queue = ConcurrentLinkedQueue<Pair<String, String>>()
    private val validUrlRegex = "https?:\\\\?/\\\\?/(?:[a-zA-Z0-9-.]+?)?(?:runescape.com|jagex.com|ctfassets.net|zdassets.com)".toRegex()
    private val singlePage = false

    /** Matches a `/segment/../` pair so it can be collapsed to `/`. */
    private val parentSegmentRegex = "(/[^./]+?/\\.\\.)/".toRegex()

    /** Whether the crawl filter excludes [path] (language variants and the sections listed in the companion). */
    fun shouldSkip(path: String): Boolean {
        if (singlePage && isTextFormat(trimQuery(trimAnchor(path)))) {
            return true
        }
        if (!path.contains("url=") && path.contains("l=")) {
            return true
        }
        return SKIPPED_FRAGMENTS.any { path.contains(it) }
    }

    /**
     * Normalises [archived] and queues it unless it is filtered out, has no local path, or was queued before.
     *
     * @param force bypasses [shouldSkip].
     */
    fun queue(archived: String, force: Boolean = false) {
        val decoded = try {
            URLDecoder.decode(archived, Charsets.UTF_8)
        } catch (e: IllegalArgumentException) {
            // A stray '%' in a scraped link must not abort processing of the page it came from.
            archived
        }
        val normalised = decoded
            .replace(parentSegmentRegex, "/")
            .replace(" ", "+")
        if (!force && shouldSkip(normalised)) {
            return
        }
        val path = getPath(normalised) ?: return
        // Set.add is atomic, so two coroutines cannot both enqueue the same path.
        if (all.add(path)) {
            queue.add(normalised to path)
        }
    }

    /**
     * Takes the next queued page, if any, and fetches it on a background coroutine.
     * Returns immediately when the queue is empty.
     */
    fun next() {
        val (req, path) = queue.poll() ?: return
        GlobalScope.launch(Dispatchers.Default) {
            grabPage(req, path)
        }
    }

    /** Maps a remote url onto its local file path, or null when the url is not on a mirrored host. */
    private fun getPath(source: String): String? {
        if (!validUrlRegex.containsMatchIn(source)) {
            return null
        }
        val host = when {
            source.contains("static.zdassets.com") -> "static.zdassets.com"
            source.contains("ctfassets.net") -> "ctfassets.net"
            source.contains("jagex.com") -> "jagex.com"
            else -> "runescape.com"
        }
        val cleaned = source
            .replace("/#/", "/")
            .replace(".ws", ".html")
            .replace(" ", "+")
        val path = convertQuery(removeDomain(cleaned, host)).replace(":", "-")
        return when {
            path.isBlank() || path == "runescape.com" -> "index.html"
            path.endsWith("/") -> path + "index.html"
            !isTextFormat(path) && !supportedFileDownloads(trimAnchor(path)) -> "$path/index.html"
            else -> path
        }
    }

    /** Fetches [source]; text formats are link-rewritten and written to [path], everything else is downloaded as-is. */
    private fun grabPage(source: String, path: String) {
        println("Grab $source $path")
        val localPath = trimAnchor(path)
        if (!isTextFormat(localPath)) {
            download(source, path)
            return
        }
        val raw = getStream(source) { queue(it, true) }
            ?.use { it.readBytes() }
            ?.toString(Charsets.UTF_8)
            ?: return
        val decoded = raw
            .replace("amp;", "")
            .replace("%2F", "/")
        val depth = path.count { it == '/' }
        val rewritten = if (localPath.endsWith("css")) {
            rewriteStylesheet(decoded, source, depth)
        } else {
            rewriteMarkup(decoded, source, depth)
        }
        val out = File(output, localPath)
        out.parentFile.mkdirs()
        out.writeText(rewritten, Charsets.UTF_8)
    }

    /**
     * Scheme and host that root-relative links found in [source] are resolved against.
     * Everything up to and including `runescape.com` when present, otherwise everything before the first path slash.
     */
    private fun siteRoot(source: String): String {
        val marker = "runescape.com"
        val index = source.indexOf(marker)
        if (index >= 0) {
            return source.substring(0, index + marker.length)
        }
        val schemeEnd = source.indexOf("://")
        val hostStart = if (schemeEnd >= 0) schemeEnd + 3 else 0
        val hostEnd = source.indexOf('/', hostStart)
        return if (hostEnd >= 0) source.substring(0, hostEnd) else source
    }

    /** Whether [url] points at one of the hosts that are mirrored locally. */
    private fun isMirrored(url: String, includeZendesk: Boolean): Boolean {
        return url.contains("runescape.com") ||
            url.contains("jagex.com") ||
            url.contains("ctfassets.net") ||
            (includeZendesk && url.contains("static.zdassets.com"))
    }

    /**
     * Rewrites `url(...)` references in a stylesheet to local relative paths and queues their targets.
     * Matches are processed last-to-first so earlier match ranges stay valid after each replacement.
     */
    private fun rewriteStylesheet(content: String, source: String, depth: Int): String {
        var data = content
        for (match in urlRegex.findAll(data).toList().reversed()) {
            val (opening, rawUrl) = match.destructured
            if (rawUrl.startsWith("data:")) {
                continue
            }
            var url = rawUrl
            if (!url.startsWith("/") && !url.startsWith("http")) {
                url = "/$url"
            }
            if (url.startsWith("/")) {
                url = "${siteRoot(source)}$url"
            }
            if (!isMirrored(url, includeZendesk = true)) {
                continue
            }
            val target = trimQuery(trimAnchor(url))
            val other = getPath(target) ?: continue
            queue(target)
            // Re-close with the same quote character the reference was opened with.
            val closing = when (opening.last()) {
                '\'' -> "'"
                '"' -> "\""
                else -> ""
            }
            data = data.replaceRange(match.range, "$opening${UrlHandler.offset(other, depth)}$closing)")
        }
        return data
    }

    /**
     * Rewrites links in html / js content to local relative paths and queues their targets.
     * Each pass walks its matches last-to-first so earlier match ranges stay valid after each replacement.
     */
    private fun rewriteMarkup(content: String, source: String, depth: Int): String {
        var data = content
        // Absolute and protocol-relative urls.
        for (match in testRegex.findAll(data).toList().reversed()) {
            val original = match.value
            var url = if (original.startsWith("//")) "https:$original" else original
            if (isMirrored(url, includeZendesk = true)) {
                val other = getPath(original) ?: continue
                url = UrlHandler.offset(other, depth)
                queue(original)
            }
            if (url != original) {
                data = data.replaceRange(match.range, url)
            }
        }
        // "[/path, size]" pairs.
        for (match in interchangeRegex.findAll(data).toList().reversed()) {
            val (relative, size) = match.destructured
            val url = "${siteRoot(source)}$relative"
            if (isMirrored(url, includeZendesk = false)) {
                val other = getPath(url) ?: continue
                queue(url)
                data = data.replaceRange(match.range, "[${UrlHandler.offset(other, depth)}, $size]")
            }
        }
        // Root-relative src / href attributes.
        for (match in srcRegex.findAll(data).toList().reversed()) {
            val (attribute, relative) = match.destructured
            val url = if (relative.startsWith("/_next/")) {
                "https://rs.runescape.com$relative"
            } else {
                "${siteRoot(source)}$relative"
            }
            if (isMirrored(url, includeZendesk = false)) {
                val other = getPath(url) ?: continue
                queue(url)
                data = data.replaceRange(match.range, "$attribute=\"${UrlHandler.offset(other, depth)}\"")
            }
        }
        // Swap placeholder gifs for the image referenced by the following <noscript> fallback.
        for (match in noScriptRegex.findAll(data).toList().reversed()) {
            val fallback = match.groupValues.last()
            data = data.replaceRange(match.range, match.value.replace(PLACEHOLDER_GIF, fallback))
        }
        return data
    }

    /** Downloads [source] byte-for-byte to [path] unless the file already exists. */
    private fun download(source: String, path: String) {
        val out = File(output, trimAnchor(path))
        if (out.exists()) {
            return
        }
        val stream = getStream(source) { queue(it, true) } ?: return
        stream.use { input ->
            out.parentFile.mkdirs()
            out.writeBytes(input.readBytes())
        }
    }

    /**
     * Opens [source] and returns its body stream (gunzipped when the response is gzip-encoded),
     * or null when the request did not end in a 200 response. The caller must close the stream.
     *
     * 301/302 responses are followed manually, up to [redirectsLeft] times. 503 responses and
     * connection failures hand the url to [requeue]; 400 responses hand over the url with
     * `../../../../` removed.
     * NOTE(review): both call sites pass `queue(it, true)`, which ignores paths already recorded
     * in `all` - so an unchanged url is presumably never actually retried; confirm the intent.
     */
    private fun getStream(source: String, redirectsLeft: Int = MAX_REDIRECTS, requeue: (String) -> Unit): InputStream? {
        val connection = URL(source).openConnection() as HttpURLConnection
        connection.addRequestProperty("User-Agent", USER_AGENT)
        val code = try {
            connection.responseCode
        } catch (e: ConnectException) {
            e.printStackTrace()
            requeue(source)
            // Do not touch the failed connection again.
            return null
        }
        if (code == 200) {
            return if (connection.contentEncoding == "gzip") {
                GZIPInputStream(connection.inputStream)
            } else {
                connection.inputStream
            }
        }
        val location: String? = connection.getHeaderField("Location")
        connection.disconnect()
        when (code) {
            301, 302 -> if (location != null && redirectsLeft > 0) {
                // Resolve against the current url so relative Location headers work too.
                return getStream(URL(URL(source), location).toString(), redirectsLeft - 1, requeue)
            }
            503 -> requeue(source)
            400 -> requeue(source.replace("../../../../", ""))
            403, 404 -> Unit
            else -> System.err.println("Error code $code $source")
        }
        return null
    }

    private companion object {
        const val MAX_REDIRECTS = 10
        const val USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
        const val PLACEHOLDER_GIF = "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAPhttps:///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7"

        /**
         * Url fragments that exclude a page from the crawl.
         * "m=forum" also covers "m=forums" and "m=hiscore" also covers "m=hiscore_oldschool".
         */
        val SKIPPED_FRAGMENTS = listOf(
            "set_lang=", "de-DE", "fr-FR", "pt-BR", "zh-CN", "ja-JP", "ko-KR", "pl-PL",
            "Incapsula",
            "m=forum",
            "m=hiscore",
            "/runemetrics/",
            "m=itemdb_rs",
            "m=itemdb_oldschool",
            "m=clan-hiscores",
            "m=temp-hiscores",
            "m=poll",
            "support.runescape.com",
            "?world=",
        )
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * String helpers that turn remote urls into local, relative file paths.
 */
object UrlHandler {

    /** Prefixes [url] with one `../` per directory level in [depth]; returns [url] unchanged when [depth] is not positive. */
    fun offset(url: String, depth: Int): String = if (depth > 0) "../".repeat(depth) + url else url

    /** Returns [url] without its first `?` and everything after it. */
    fun trimQuery(url: String): String = url.substringBefore("?")

    /** Returns [url] without its first `#` and everything after it. */
    fun trimAnchor(url: String): String = url.substringBefore("#")

    /**
     * Removes the protocol and host from a url, converting subdomains (except www) to top level
     */
    fun removeDomain(url: String, host: String): String {
        val protocol = when {
            url.startsWith("http://") -> "http://"
            url.startsWith("https://") -> "https://"
            else -> ""
        }
        val stripped = url.removePrefix(protocol).removePrefix("www.")
        // Everything up to and including the first slash; empty when there is no slash.
        val hostName = stripped.substring(0, stripped.indexOf("/") + 1)
        return stripped.replace(hostName, getTopLevelHost(hostName, host))
    }

    /**
     * Removes the first domain when two exist in a url
     */
    fun removePrefixDomain(url: String): String {
        val index = nestedProtocolIndex(url)
        return if (index >= 0) url.substring(index) else url
    }

    /**
     * Keeps only the first domain (everything before the nested url) when two exist in a url
     */
    fun removeSuffixDomain(url: String): String {
        val index = nestedProtocolIndex(url)
        return if (index >= 0) url.substring(0, index) else url
    }

    /**
     * Index of the earliest `http://` or `https://` that starts after the first character, or -1 when there is none.
     * Taking the earliest match keeps a nested https url intact even when its own query contains another http url.
     */
    private fun nestedProtocolIndex(url: String): Int {
        return listOf(url.indexOf("http://", 1), url.indexOf("https://", 1))
            .filter { it >= 0 }
            .minOrNull() ?: -1
    }

    /** Reduces `sub.host/` to `sub/` and a bare `host/` to an empty string. */
    private fun getTopLevelHost(hostName: String, host: String): String {
        return if (hostName.length > host.length + 1) {
            hostName.replace(".$host", "")
        } else {
            hostName.replace("$host/", "")
        }
    }

    /**
     * Converts a urls query into a unique page name
     *
     * The query pairs are appended to the page name as `-key-value` (or `-pair` when there is no `=`),
     * followed by the page's file extension and then the url's anchor, e.g.
     * `news/list.html?cat=1#top` becomes `news/list-cat-1.html#top`.
     * Urls without a `?` are returned unchanged.
     */
    fun convertQuery(url: String): String {
        val queryIndex = url.lastIndexOf("?")
        if (queryIndex < 0) {
            return url
        }
        val slashIndex = url.lastIndexOf("/", queryIndex).coerceAtLeast(0)
        // Page name between the last slash and the query, cut at any earlier '?'.
        var name = url.substring(slashIndex, queryIndex).substringBefore("?")
        var extension = ""
        if (name.contains(".")) {
            val parts = name.split(".")
            name = parts.first()
            extension = ".${parts.last()}"
        }
        // The anchor, if any, follows the query - so search forwards from the '?'.
        val anchorIndex = url.indexOf("#", queryIndex)
        val query = url.substring(queryIndex + 1, if (anchorIndex >= 0) anchorIndex else url.length)
        return buildString {
            if (slashIndex > 0) {
                append(url.substring(0, slashIndex).replace("?", "%3F"))
            }
            append(name)
            for (pair in query.split("&")) {
                if (pair.contains("=")) {
                    val parts = pair.split("=")
                    if (pair.startsWith("url=") && extension.isBlank()) {
                        // Borrow the extension of the url passed as a parameter.
                        extension = ".${pair.split(".").last()}"
                    }
                    append("-").append(parts.first()).append("-").append(parts.last())
                } else {
                    append("-").append(pair)
                }
            }
            append(extension)
            if (anchorIndex >= 0) {
                append(url.substring(anchorIndex))
            }
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment