Skip to content

Instantly share code, notes, and snippets.

@GregHib
Created February 3, 2022 21:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save GregHib/5be66f39c2f90a761618535a44a85d46 to your computer and use it in GitHub Desktop.
Save GregHib/5be66f39c2f90a761618535a44a85d46 to your computer and use it in GitHub Desktop.
Website Archiver
import world.gregs.voidps.tools.UrlHandler.convertQuery
import world.gregs.voidps.tools.UrlHandler.offset
import world.gregs.voidps.tools.UrlHandler.removeDomain
import world.gregs.voidps.tools.UrlHandler.removePrefixDomain
import world.gregs.voidps.tools.UrlHandler.removeSuffixDomain
import world.gregs.voidps.tools.UrlHandler.trimAnchor
import world.gregs.voidps.tools.UrlHandler.trimQuery
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.GlobalScope
import kotlinx.coroutines.launch
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import java.io.File
import java.net.HttpURLConnection
import java.net.URL
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.ConcurrentLinkedQueue
/**
 * Mirrors a wayback.archive-it.org capture of runescape.com into a local directory named after
 * the capture date.
 *
 * Urls are held in an internal queue; [next] takes one entry and fetches it on a coroutine.
 * Text resources have the runescape.com / jagex.com links they contain rewritten to relative
 * local paths, and every rewritten link is queued for download in turn.
 *
 * @param date capture date whose first eight characters are read as `yyyyMMdd`; they name the
 * output directory, and the year is used to skip captures from earlier years.
 * @param languages when false, urls containing `l=` or `set_lang=` are skipped.
 * @param knowledgeBase when false, urls containing `kbase` are skipped.
 * @param downloads when false, urls containing `downloads_and_wallpapers` are skipped.
 * @param singlePage when true, only urls ending in a [supportedFileDownloads] extension are followed.
 */
class SiteMirror(
    date: String,
    private val languages: Boolean,
    private val knowledgeBase: Boolean,
    private val downloads: Boolean,
    private val singlePage: Boolean,
) {
    private val year = date.take(4).toInt()
    private val output = File("./$year-${date.substring(4, 6)}-${date.substring(6, 8)}/")

    /** Local paths that have already been queued; shared between the fetching coroutines. */
    private val all: MutableSet<String> = ConcurrentHashMap.newKeySet()

    /** Pending work as (archived url, local path) pairs. */
    private val queue = ConcurrentLinkedQueue<Pair<String, String>>()

    // Archived runescape/jagex url; every slash may be escaped as "\/".
    private val validUrlRegex = "https?:\\\\?/\\\\?/wayback\\.archive-it\\.org\\\\?/all\\\\?/.*?\\\\?/https?:\\\\?/\\\\?/(?:[a-zA-Z0-9-.]+?)?(?:runescape|jagex).com".toRegex()

    // Any absolute or protocol-relative url, with optionally escaped slashes.
    private val testRegex = "(?:https?:)?\\\\?/\\\\?/[-a-zA-Z0-9+&@#\\\\/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]".toRegex()

    // Double-quoted "/wb-static/..." reference; group 1 is the path without the quotes.
    private val staticRegex = "\"(/wb-static/[-a-zA-Z0-9+&@#\\\\/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|])\"".toRegex()

    init {
        queue("http://wayback.archive-it.org/all/$date/http://www.runescape.com/", force = true)
    }

    /**
     * Takes the next queued url, if any, and fetches it on a background coroutine.
     * Returns immediately when the queue is empty.
     */
    fun next() {
        val (req, path) = queue.poll() ?: return
        GlobalScope.launch(Dispatchers.Default) {
            grabPage(req, path)
        }
    }

    /**
     * Decides whether an archived url is excluded by the capture year or by the constructor flags.
     */
    private fun shouldSkip(path: String): Boolean {
        // The four characters after "all/" are the capture year. Only parse them when the marker
        // is present and followed by at least four characters, rather than slicing a fixed range
        // that may fall outside the string or cover unrelated text.
        val captureYear = path.substringAfter("all/", "")
            .take(4)
            .takeIf { it.length == 4 }
            ?.toIntOrNull()
        return when {
            captureYear != null && captureYear < year -> true
            singlePage && !supportedFileDownloads(trimQuery(trimAnchor(path))) -> true
            !languages && (path.contains("l=") || path.contains("set_lang=")) -> true
            !knowledgeBase && path.contains("kbase") -> true
            !downloads && path.contains("downloads_and_wallpapers") -> true
            else -> false
        }
    }

    /**
     * Queues [archived] for download unless it is filtered out, has no local path, or its local
     * path has been queued before.
     *
     * @param force bypasses [shouldSkip].
     */
    private fun queue(archived: String, force: Boolean = false) {
        if (!force && shouldSkip(archived)) {
            return
        }
        val path = getPath(archived) ?: return
        // add() is atomic on the concurrent set, so two coroutines cannot both queue the same path.
        if (all.add(path)) {
            queue.add(archived to path)
        }
    }

    /**
     * Maps a url found in a page to the relative path it is stored under locally.
     *
     * @return the local path, or null when the url is not one this mirror stores.
     */
    private fun getPath(source: String): String? {
        return when {
            validUrlRegex.containsMatchIn(source) -> {
                // Drop the anchor, unwrap the archived url and turn ".ws" pages into ".html" files.
                val page = removePrefixDomain(trimAnchor(source).replace(".ws", ".html"))
                val path = convertQuery(removeDomain(page, "runescape.com"))
                when {
                    path.isBlank() || path == "runescape.com" -> "index.html"
                    path.endsWith("/") -> path + "index.html"
                    !path.endsWith(".html") && !supportedFileDownloads(path) -> "$path.html"
                    else -> path
                }
            }
            !source.contains("http") && !supportedFileDownloads(source) -> convertQuery(source)
            source.contains("/wb-static") -> source.substring(source.indexOf("/wb-static") + 1)
            source.contains("partner.archive-it.org/static/") -> source.substring(source.indexOf("/static/") + 1)
            else -> null
        }
    }

    /**
     * Removes from [document] every `style` element whose data contains "disclaim" and every
     * `wb_div` element.
     *
     * Script elements are currently left untouched: the filter that removed scripts mentioning
     * wombat / disclaim / ait / TimeShift was commented out in the original code.
     */
    fun removeDisclaimer(document: Document) {
        document.select("style").reversed()
            .filter { it.data().contains("disclaim") }
            .forEach { it.remove() }
        document.select("wb_div").reversed().forEach { it.remove() }
    }

    /**
     * Downloads [source] and stores it under [path] inside the output directory.
     *
     * Text formats have their links rewritten to local relative paths (queueing each target)
     * before being written; everything else is written byte-for-byte unless the file already exists.
     * Responses with a status other than 200 are ignored.
     */
    private fun grabPage(source: String, path: String) {
        val connection = URL(source).openConnection() as HttpURLConnection
        if (connection.responseCode != 200) {
            connection.disconnect()
            return
        }
        val text = isTextFormat(path)
        println("Grab $source $path $text")
        val out = File(output, trimAnchor(path))
        if (!text) {
            if (out.exists()) {
                // Body is not needed; release the connection instead of leaving it open.
                connection.disconnect()
                return
            }
            out.parentFile.mkdirs()
            out.writeBytes(connection.inputStream.use { it.readBytes() })
            return
        }

        var data = connection.inputStream.use { it.readBytes() }.toString(Charsets.UTF_8)
        // Number of directories the stored file sits below the output root.
        val depth = path.count { it == '/' }
        val prefix = removeSuffixDomain(source)

        // Matches are collected up front and applied back-to-front so earlier ranges stay valid.
        for (match in testRegex.findAll(data).toList().reversed()) {
            var original = standardise(match.value)
            var url = original
            if (url.contains("archive-it.org")) {
                url = removePrefixDomain(url)
            }
            if (url.contains("runescape.com") || url.contains("jagex.com")) {
                // Links that bypass the archive are re-rooted onto the capture being mirrored.
                if (!original.contains("archive-it.org") && prefix.contains("archive-it.org")) {
                    original = "$prefix$original"
                }
                val other = getPath(original) ?: continue
                url = offset(other, depth)
                queue(original)
            }
            if (url != original) {
                data = data.replaceRange(match.range, url)
            }
        }

        for (match in staticRegex.findAll(data).toList().reversed()) {
            val reference = match.groups[1] ?: continue
            val original = standardise(reference.value)
            // Queue each static file once instead of once per page that references it.
            if (all.add(original)) {
                queue.add("http://wayback.archive-it.org$original" to original)
            }
            val local = getPath(original) ?: continue
            // Replace only the captured path so the surrounding quotes are preserved.
            data = data.replaceRange(reference.range, offset(local, depth))
        }

        if (data.contains(AIT)) {
            getPath(AIT)?.let { url ->
                data = data.replace(AIT, offset(url, depth))
                queue("https:$AIT")
            }
        }

        out.parentFile.mkdirs()
        if (out.extension == "html") {
            val document = Jsoup.parse(data.replace("charset=iso-8859-1", "charset=utf-8"))
            removeDisclaimer(document)
            // The file is written as UTF-8, so declare that when the page declares no charset.
            if (!data.contains("charset=")) {
                document.head().appendElement("meta").attr("http-equiv", "Content-Type").attr("content", "text/html;charset=utf-8")
            }
            out.writeText(document.toString(), Charsets.UTF_8)
        } else {
            out.writeText(data)
        }
    }

    /**
     * Gives protocol-relative urls an `https:` scheme and unescapes `\/` into `/`.
     */
    private fun standardise(url: String): String {
        val absolute = if (url.startsWith("//")) "https:$url" else url
        return absolute.replace("\\/", "/")
    }

    companion object {
        const val AIT = "//partner.archive-it.org/static/AIT_Analytics.js"

        private val TEXT_EXTENSIONS = listOf(".ws", ".html", ".css", ".js")

        private val DOWNLOAD_EXTENSIONS = listOf(
            ".exe", ".msi", ".mp3", ".gif", ".jpg", ".png", ".bz2", ".zip",
            ".tar", ".jar", ".ico", ".rss", ".css", ".js", ".json", ".svg",
            ".dmg", ".woff", ".woff2", ".ttf", ".eot", ".webp", ".webm", ".mp4",
        )

        /** True when [url] ends (ignoring case) in .ws, .html, .css or .js. */
        fun isTextFormat(url: String): Boolean = TEXT_EXTENSIONS.any { url.endsWith(it, ignoreCase = true) }

        /** True when [file] ends (ignoring case) in one of the extensions stored without a ".html" suffix. */
        fun supportedFileDownloads(file: String): Boolean = DOWNLOAD_EXTENSIONS.any { file.endsWith(it, ignoreCase = true) }
    }
}
import world.gregs.voidps.tools.SiteMirror.Companion.isTextFormat
import world.gregs.voidps.tools.SiteMirror.Companion.supportedFileDownloads
import world.gregs.voidps.tools.UrlHandler.convertQuery
import world.gregs.voidps.tools.UrlHandler.removeDomain
import world.gregs.voidps.tools.UrlHandler.trimAnchor
import world.gregs.voidps.tools.UrlHandler.trimQuery
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.GlobalScope
import kotlinx.coroutines.launch
import java.io.File
import java.io.InputStream
import java.net.ConnectException
import java.net.HttpURLConnection
import java.net.URL
import java.net.URLDecoder
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.ConcurrentLinkedQueue
import java.util.zip.GZIPInputStream
// Absolute or protocol-relative url; each slash of the "//" may be escaped as "\/".
private val testRegex = Regex("(?:https?:)?\\\\?/\\\\?/[-a-zA-Z0-9+&@#\\\\/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]")

// src/href attribute whose double-quoted value starts with "/": group 1 = attribute name, group 2 = value.
private val srcRegex = Regex("(src|href)=\"(/.*?)\"")

// Placeholder src followed by a <noscript><img> fallback: group 1 = the fallback image's src.
private val noScriptRegex = Regex("src=\":///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7\".*?<noscript><img.*?src=\"(.*?)\"")

// css url(...) reference: group 1 = "url(" plus any opening quote, group 2 = the referenced url.
private val urlRegex = Regex("(url\\(\"?'?)(.*?)'?\"?\\)")

// "[/path, name]" pair: group 1 = path starting with "/", group 2 = lowercase name.
private val interchangeRegex = Regex("\\[(/.*?), ([a-z]+)]")

// Directory the live mirror is written to.
private val output = File("./live/")
/**
 * Seeds a [SiteMirrorLive] with a fixed list of runescape.com pages and then pumps its queue.
 *
 * Every seed is queued with `force = true`. The final loop calls `next()` repeatedly and never
 * exits, so the process runs until it is killed.
 */
fun main() {
    val player = "2+Taint3d"
    val seeds = listOf(
        "https://runescape.com/",
        "https://play.runescape.com/",
        "https://play.runescape.com/runescape",
        "https://play.runescape.com/oldschool",
        "https://play.runescape.com/returning-players",
        "https://rs.runescape.com/premier-club",
        "https://secure.runescape.com/m=forum/forums", // Forums
        "https://secure.runescape.com/m=forum/forums?294,295,thd,391,66049913", // Forum category
        "https://secure.runescape.com/m=forum/forums?294,295,396,66126612", // Forum thread
        "https://apps.runescape.com/runemetrics/app/welcome", // Runemetrics
        "https://apps.runescape.com/runemetrics/app/overview/player/$player", // Runemetrics user
        "https://apps.runescape.com/runemetrics/app/levels/player/$player", // Runemetrics levels
        "https://apps.runescape.com/runemetrics/app/xp-monthly/player/$player/-1", // Runemetrics xp
        "https://apps.runescape.com/runemetrics/app/xp-monthly/player/$player/-1", // Runemetrics xp
        "https://apps.runescape.com/runemetrics/app/activities/player/$player", // Runemetrics event log
        "https://apps.runescape.com/runemetrics/app/quests/player/$player", // Runemetrics quests
        "https://secure.runescape.com/m=itemdb_rs/results", // GE search
        "https://secure.runescape.com/m=itemdb_rs/", // GE
        "https://secure.runescape.com/m=itemdb_rs/catalogue", // GE catalogue
        "https://secure.runescape.com/m=itemdb_rs/catalogue?cat=32", // GE catalogue
        "https://secure.runescape.com/m=itemdb_rs/top100?list=2", // GE price rises
        "https://secure.runescape.com/m=itemdb_rs/Santa+hat/viewitem?obj=1050", // GE item
        "https://secure.runescape.com/m=hiscore/", // Hiscores
        "https://secure.runescape.com/m=hiscore/ranking", // Hiscores
        "https://secure.runescape.com/m=hiscore/ranking?category_type=1&table=26", // Achievements
        "http://services.runescape.com/m=clan-hiscores/ranking", // Clans
        "http://services.runescape.com/m=clan-hiscores/landing.ws", // Clans
        "http://services.runescape.com/m=clan-home/clan/Efficiency+Experts", // Clan info
        "http://services.runescape.com/m=clan-hiscores/compare.ws?clanName=Efficiency+Experts", // Clan stats
        "http://services.runescape.com/m=clan-hiscores/members.ws?clanName=Efficiency+Experts", // Clan mates
        "http://services.runescape.com/m=temp-hiscores/", // Seasonal
        "http://services.runescape.com/m=temp-hiscores/ranking?id=1508716800045&filter=-1&page=1", // Seasonal topic
        "http://services.runescape.com/m=temp-hiscores/compare?user1=Sina", // Seasonal player
        "https://secure.runescape.com/m=hiscore/ranking?category_type=1", // Activities
        "https://secure.runescape.com/m=hiscore/compare?user1=Blacked+Out&category_type=1", // Player activities
        "https://secure.runescape.com/m=hiscore/ranking?category_type=0&table=0&time_filter=1&page=1",
        "https://services.runescape.com/m=hiscore/compare?user1=le+me", // Player
        "https://secure.runescape.com/m=hiscore/compare", // Player
        "https://oldschool.runescape.com/premier-club/",
        "https://secure.runescape.com/m=poll/", // Player power
        "https://secure.runescape.com/m=poll/archive?id=1596", // Poll
        "https://support.runescape.com/hc/en-gb", // Support
        "https://support.runescape.com/hc/en-gb/categories/200977391-Your-account", // Category
        "https://support.runescape.com/hc/en-gb/articles/360001313349-Go-here-to-install", // Post
        "https://secure.runescape.com/m=news/", // News
        "https://secure.runescape.com/m=news/archive?oldschool=1", // News
        "https://secure.runescape.com/m=news/shooting-stars-poll-blog?oldschool=1", // osrs news post
        "https://secure.runescape.com/m=news/8th-birthday-event-and-lms-changes?oldschool=1", // osrs news post
        "https://secure.runescape.com/m=news/the-isle-of-souls?oldschool=1", // osrs news post
        "https://secure.runescape.com/m=news/equipment-rebalancing-updated?oldschool=1", // osrs news post
        "https://secure.runescape.com/m=news/equipment-rebalance-postponed?oldschool=1", // osrs news post
        "https://secure.runescape.com/m=news/below-ice-mountain-poll-blog?oldschool=1", // osrs news post
        "https://secure.runescape.com/m=itemdb_oldschool/", // GE
        "https://secure.runescape.com/m=itemdb_oldschool/top100?list=1&scale=0", // GE most traded
        "https://secure.runescape.com/m=itemdb_oldschool/Santa+hat/viewitem?obj=1050", // GE item
        "https://secure.runescape.com/m=poll/oldschool/", // poll
        "https://secure.runescape.com/m=poll/oldschool/results?id=1616", // poll results
        "https://secure.runescape.com/m=hiscore_oldschool/overall", // hiscores
        "https://secure.runescape.com/m=hiscore_oldschool/hiscorepersonal?user1=Lynx Titan", // hiscore user
    )

    val mirror = SiteMirrorLive()
    for (seed in seeds) {
        mirror.queue(seed, force = true)
    }
    while (true) {
        mirror.next()
    }
}
/**
 * Mirrors pages of the live runescape.com sites (plus jagex.com, ctfassets.net and
 * zdassets.com assets) into the top-level `output` directory.
 *
 * Urls are held in an internal queue; [next] takes one entry and fetches it on a coroutine.
 * Text resources have their links rewritten to relative local paths and every rewritten link
 * is queued in turn.
 */
class SiteMirrorLive {
    /** Local paths that have already been queued; shared between the fetching coroutines. */
    private val all: MutableSet<String> = ConcurrentHashMap.newKeySet()

    /** Pending work as (url, local path) pairs. */
    private val queue = ConcurrentLinkedQueue<Pair<String, String>>()

    // Absolute url on one of the mirrored hosts; each slash of the "//" may be escaped as "\/".
    private val validUrlRegex = "https?:\\\\?/\\\\?/(?:[a-zA-Z0-9-.]+?)?(?:runescape.com|jagex.com|ctfassets.net|zdassets.com)".toRegex()
    private val singlePage = false

    // "/segment/../" where the segment contains neither "." nor "/"; collapsed to "/" when queueing.
    private val parentSegmentRegex = "(/[^./]+?/\\.\\.)/".toRegex()

    // Substrings that exclude a url from the mirror. "m=forum" also covers "m=forums" and
    // "m=hiscore" also covers "m=hiscore_oldschool".
    private val skippedMarkers = listOf(
        "set_lang=", "de-DE", "fr-FR", "pt-BR", "zh-CN", "ja-JP", "ko-KR", "pl-PL",
        "Incapsula",
        "m=forum",
        "m=hiscore",
        "/runemetrics/",
        "m=itemdb_rs",
        "m=itemdb_oldschool",
        "m=clan-hiscores",
        "m=temp-hiscores",
        "m=poll",
        "support.runescape.com",
        "?world=",
    )

    /**
     * Decides whether a url should be left out of the mirror.
     *
     * @return true for language variants and for the sections listed in the skip markers.
     */
    fun shouldSkip(path: String): Boolean {
        if (singlePage && isTextFormat(trimQuery(trimAnchor(path)))) {
            return true
        }
        // "url=" itself contains "l=", so it is excluded from the language-parameter check.
        if (!path.contains("url=") && path.contains("l=")) {
            return true
        }
        return skippedMarkers.any { path.contains(it) }
    }

    /**
     * Queues [archived] for download unless it is filtered out, is not on a mirrored host, or
     * its local path has been queued before.
     *
     * The url is percent-decoded, "/x/../" segments are collapsed and spaces become "+".
     *
     * @param force bypasses [shouldSkip].
     */
    fun queue(archived: String, force: Boolean = false) {
        val decoded = try {
            URLDecoder.decode(archived, Charsets.UTF_8)
        } catch (e: IllegalArgumentException) {
            // Malformed percent escape in a scraped link: keep the raw text rather than
            // aborting the page that contained it.
            archived
        }
        val url = decoded
            .replace(parentSegmentRegex, "/")
            .replace(" ", "+")
        if (!force && shouldSkip(url)) {
            return
        }
        val path = getPath(url) ?: return
        // add() is atomic on the concurrent set, so two coroutines cannot both queue the same path.
        if (all.add(path)) {
            queue.add(url to path)
        }
    }

    /**
     * Takes the next queued url, if any, and fetches it on a background coroutine.
     * Returns immediately when the queue is empty.
     */
    fun next() {
        val (req, path) = queue.poll() ?: return
        GlobalScope.launch(Dispatchers.Default) {
            grabPage(req, path)
        }
    }

    /**
     * Maps a url on one of the mirrored hosts to the relative path it is stored under locally.
     *
     * @return the local path, or null when [source] does not match the mirrored hosts.
     */
    private fun getPath(source: String): String? {
        if (!validUrlRegex.containsMatchIn(source)) {
            return null
        }
        val host = when {
            source.contains("static.zdassets.com") -> "static.zdassets.com"
            source.contains("ctfassets.net") -> "ctfassets.net"
            source.contains("jagex.com") -> "jagex.com"
            else -> "runescape.com"
        }
        val cleaned = source
            .replace("/#/", "/")
            .replace(".ws", ".html")
            .replace(" ", "+")
        val path = convertQuery(removeDomain(cleaned, host)).replace(":", "-")
        return when {
            path.isBlank() || path == "runescape.com" -> "index.html"
            path.endsWith("/") -> path + "index.html"
            !isTextFormat(path) && !supportedFileDownloads(trimAnchor(path)) -> "$path/index.html"
            else -> path
        }
    }

    /**
     * Downloads [source] and stores it under [path] inside the output directory, rewriting the
     * links of text formats first. Non-text resources are handed to [download].
     */
    private fun grabPage(source: String, path: String) {
        println("Grab $source $path")
        val target = trimAnchor(path)
        if (!isTextFormat(target)) {
            download(source, path)
            return
        }
        val stream = getStream(source) { queue(it, true) } ?: return
        val data = stream.use { it.readBytes() }.toString(Charsets.UTF_8)
            .replace("amp;", "")
            .replace("&amp;", "&")
            .replace("%2F", "/")
        // Number of directories the stored file sits below the output root.
        val depth = path.count { it == '/' }
        val rewritten = if (target.endsWith("css")) {
            rewriteStylesheet(data, source, depth)
        } else {
            rewriteMarkup(data, source, depth)
        }
        val out = File(output, target)
        out.parentFile.mkdirs()
        out.writeText(rewritten, Charsets.UTF_8)
    }

    /** Rewrites every `url(...)` reference in a stylesheet to a local path and queues its target. */
    private fun rewriteStylesheet(content: String, source: String, depth: Int): String {
        var data = content
        // Matches are collected up front and applied back-to-front so earlier ranges stay valid.
        for (match in urlRegex.findAll(content).toList().reversed()) {
            val opening = match.groupValues[1]
            var url = match.groupValues[2]
            if (url.startsWith("data:")) {
                continue
            }
            if (!url.startsWith("/") && !url.startsWith("http")) {
                url = "/$url"
            }
            if (url.startsWith("/")) {
                url = "${siteRoot(source)}$url"
            }
            if (isGameHost(url) || url.contains("static.zdassets.com")) {
                val resource = trimQuery(trimAnchor(url))
                val local = getPath(resource) ?: continue
                queue(resource)
                // Re-emit the same quote character the reference was opened with.
                val closing = when (opening.last()) {
                    '\'' -> "'"
                    '"' -> "\""
                    else -> ""
                }
                data = data.replaceRange(match.range, "$opening${UrlHandler.offset(local, depth)}$closing)")
            }
        }
        return data
    }

    /**
     * Rewrites the links of an html/js document to local paths and queues their targets.
     * Each pass scans the output of the previous one.
     */
    private fun rewriteMarkup(content: String, source: String, depth: Int): String {
        var data = content

        // Pass 1: absolute and protocol-relative urls.
        for (match in testRegex.findAll(data).toList().reversed()) {
            val original = match.value
            val absolute = if (original.startsWith("//")) "https:$original" else original
            var replacement = absolute
            if (isGameHost(absolute) || absolute.contains("static.zdassets.com")) {
                // Resolve the https-prefixed form: a bare "//host/..." link never matches
                // validUrlRegex, so it would otherwise be skipped instead of mirrored.
                val local = getPath(absolute) ?: continue
                replacement = UrlHandler.offset(local, depth)
                queue(absolute)
            }
            if (replacement != original) {
                data = data.replaceRange(match.range, replacement)
            }
        }

        // Pass 2: "[/path, name]" pairs.
        for (match in interchangeRegex.findAll(data).toList().reversed()) {
            val (relative, size) = match.destructured
            val url = "${siteRoot(source)}$relative"
            if (!isGameHost(url)) {
                continue
            }
            val local = getPath(url) ?: continue
            queue(url)
            data = data.replaceRange(match.range, "[${UrlHandler.offset(local, depth)}, $size]")
        }

        // Pass 3: root-relative src/href attributes.
        for (match in srcRegex.findAll(data).toList().reversed()) {
            val (attribute, relative) = match.destructured
            val url = if (relative.startsWith("/_next/")) {
                "https://rs.runescape.com$relative"
            } else {
                "${siteRoot(source)}$relative"
            }
            if (!isGameHost(url)) {
                continue
            }
            val local = getPath(url) ?: continue
            queue(url)
            data = data.replaceRange(match.range, "$attribute=\"${UrlHandler.offset(local, depth)}\"")
        }

        // Pass 4: swap the placeholder src for the image url given in the <noscript> fallback.
        for (match in noScriptRegex.findAll(data).toList().reversed()) {
            val fallback = match.groupValues.last()
            val patched = match.value.replace(":///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7", fallback)
            data = data.replaceRange(match.range, patched)
        }

        if (data.contains(SiteMirror.AIT)) {
            getPath(SiteMirror.AIT)?.let { url ->
                data = data.replace(SiteMirror.AIT, UrlHandler.offset(url, depth))
                queue("https:${SiteMirror.AIT}")
            }
        }
        return data
    }

    /** True when [url] mentions runescape.com, jagex.com or ctfassets.net. */
    private fun isGameHost(url: String): Boolean =
        url.contains("runescape.com") || url.contains("jagex.com") || url.contains("ctfassets.net")

    /**
     * Returns the scheme-and-host prefix of [source] that root-relative links are resolved against.
     *
     * For runescape.com sources this is everything up to and including "runescape.com"; for any
     * other host it is the url's protocol and authority, instead of a fixed-length slice of the string.
     */
    private fun siteRoot(source: String): String {
        val marker = "runescape.com"
        val index = source.indexOf(marker)
        if (index >= 0) {
            return source.substring(0, index + marker.length)
        }
        val url = URL(source)
        return "${url.protocol}://${url.authority}"
    }

    /** Fetches [source] and writes its bytes under [path] unless that file already exists. */
    private fun download(source: String, path: String) {
        val stream = getStream(source) { queue(it, true) } ?: return
        stream.use { input ->
            val out = File(output, trimAnchor(path))
            if (!out.exists()) {
                out.parentFile.mkdirs()
                out.writeBytes(input.readBytes())
            }
        }
    }

    /**
     * Opens [source] with a browser user agent and returns its body stream, unwrapping gzip.
     *
     * 301/302 responses are followed through the Location header (relative locations are resolved
     * against [source]) up to a fixed number of hops. 503 and connection failures hand [source]
     * to [requeue]; 400 hands over [source] with "../../../../" removed; 403/404 are ignored
     * silently and any other status is reported on stderr.
     *
     * NOTE(review): the callers pass `queue(it, true)` as [requeue], and queue() drops urls whose
     * local path is already recorded, so re-queuing an unchanged url looks like a no-op - confirm
     * whether a real retry was intended.
     *
     * @return the response body, or null when nothing could be fetched. The caller closes it.
     */
    private fun getStream(source: String, redirects: Int = 0, requeue: (String) -> Unit): InputStream? {
        val connection = URL(source).openConnection() as HttpURLConnection
        connection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36")
        try {
            val code = connection.responseCode
            if (code == 200) {
                return if (connection.contentEncoding == "gzip") {
                    GZIPInputStream(connection.inputStream)
                } else {
                    connection.inputStream
                }
            }
            val location: String? = connection.getHeaderField("Location")
            connection.disconnect()
            when (code) {
                301, 302 -> {
                    // A missing Location header or a redirect loop ends the fetch instead of crashing.
                    if (location != null && redirects < MAX_REDIRECTS) {
                        val resolved = URL(URL(source), location).toString()
                        return getStream(resolved, redirects + 1, requeue)
                    }
                }
                503 -> requeue(source)
                400 -> requeue(source.replace("../../../../", ""))
                403, 404 -> Unit
                else -> System.err.println("Error code $code $source")
            }
            return null
        } catch (e: ConnectException) {
            e.printStackTrace()
            requeue(source)
            // The connection never opened, so there is no stream to hand back.
            return null
        }
    }

    private companion object {
        /** Upper bound on chained redirects followed for a single request. */
        const val MAX_REDIRECTS = 10
    }
}
/**
 * String helpers for turning urls into relative local file paths.
 */
object UrlHandler {

    /**
     * Prefixes [url] with one "../" per directory level.
     *
     * @param depth number of "../" segments to prepend; values of zero or less leave [url] unchanged.
     */
    fun offset(url: String, depth: Int): String = if (depth > 0) "../".repeat(depth) + url else url

    /** Returns [url] up to (not including) its first "?", or [url] itself when it has none. */
    fun trimQuery(url: String): String = url.substringBefore("?")

    /** Returns [url] up to (not including) its first "#", or [url] itself when it has none. */
    fun trimAnchor(url: String): String = url.substringBefore("#")

    /**
     * Removes the protocol and host from a url, converting subdomains (except www) to top level
     *
     * Only the leading host is rewritten; the same host text appearing later in the url
     * (for example inside a query value) is left as it is.
     *
     * @param host the registrable domain to strip, e.g. "runescape.com".
     */
    fun removeDomain(url: String, host: String): String {
        val withoutProtocol = when {
            url.startsWith("http://") -> url.removePrefix("http://")
            url.startsWith("https://") -> url.removePrefix("https://")
            else -> url
        }
        val bare = withoutProtocol.removePrefix("www.")
        // Without any "/" the host part is empty and the url is returned as it stands.
        val end = bare.indexOf("/")
        val hostName = bare.substring(0, end + 1)
        return getTopLevelHost(hostName, host) + bare.substring(end + 1)
    }

    /**
     * Removes the first domain when two exist in a url
     */
    fun removePrefixDomain(url: String): String {
        val index = nestedUrlIndex(url)
        return if (index != -1) url.substring(index) else url
    }

    /**
     * Keeps only the part of a url that precedes a second, nested url; returns [url] unchanged
     * when no nested url exists.
     */
    fun removeSuffixDomain(url: String): String {
        val index = nestedUrlIndex(url)
        return if (index != -1) url.substring(0, index) else url
    }

    /**
     * Index of an "http://" (checked first) or "https://" that starts after the first character
     * of [url], or -1 when there is none.
     */
    private fun nestedUrlIndex(url: String): Int {
        val http = url.indexOf("http://", 1)
        return if (http != -1) http else url.indexOf("https://", 1)
    }

    /**
     * Turns "sub.host/" into "sub/" and "host/" into an empty string.
     */
    private fun getTopLevelHost(hostName: String, host: String): String {
        return if (hostName.length > host.length + 1) {
            hostName.replace(".$host", "")
        } else {
            hostName.replace("$host/", "")
        }
    }

    /**
     * Converts a urls query into a unique page name
     *
     * Every `key=value` pair becomes `-key-value` appended to the page name, the page's
     * extension is moved behind the pairs, and an anchor that follows the query is kept at
     * the very end. Urls without a "?" are returned unchanged.
     */
    fun convertQuery(url: String): String {
        val queryStart = url.lastIndexOf("?")
        if (queryStart < 0) {
            return url
        }
        val nameStart = url.lastIndexOf("/", queryStart).coerceAtLeast(0)
        var name = url.substring(nameStart, queryStart)
        if (name.contains("?")) {
            name = name.substring(0, name.lastIndexOf("?"))
        }
        var extension = ""
        if (name.contains(".")) {
            val parts = name.split(".")
            name = parts.first()
            extension = ".${parts.last()}"
        }
        // Search forwards from the "?": the anchor of interest follows the query. Searching
        // backwards either missed it or produced an inverted substring range.
        val anchorIndex = url.indexOf("#", queryStart)
        val query = url.substring(queryStart + 1, if (anchorIndex >= 0) anchorIndex else url.length)
        return buildString {
            if (nameStart > 0) {
                // Earlier "?" characters are escaped so the directory part stays a valid path.
                append(url.substring(0, nameStart).replace("?", "%3F"))
            }
            append(name)
            for (pair in query.split("&")) {
                if (pair.contains("=")) {
                    val parts = pair.split("=")
                    // A page without its own extension borrows the one of its "url=" target.
                    if (pair.startsWith("url=") && extension.isBlank()) {
                        extension = ".${pair.split(".").last()}"
                    }
                    append("-").append(parts.first()).append("-").append(parts.last())
                } else {
                    append("-").append(pair)
                }
            }
            append(extension)
            if (anchorIndex >= 0) {
                append(url.substring(anchorIndex))
            }
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment