Skip to content

Instantly share code, notes, and snippets.

@marquesds
Created March 2, 2021 12:44
Show Gist options
  • Save marquesds/facfdf8c7e1b9c94bec4f362f7abd142 to your computer and use it in GitHub Desktop.
Save marquesds/facfdf8c7e1b9c94bec4f362f7abd142 to your computer and use it in GitHub Desktop.
A simple web scraper written in Kotlin
// Crawler.kt
import kotlinx.coroutines.*
import org.json.JSONObject
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import org.jsoup.select.Elements
/**
 * Downloads a list of pages concurrently and extracts the elements that match a CSS selector.
 */
class Crawler {
    /**
     * Fetches every URL in [urls] concurrently, then applies [selector] to each downloaded page.
     *
     * @param urls pages to download.
     * @param selector CSS selector passed to jsoup's `Document.select`.
     * @return one [Elements] per URL, in the same order as [urls].
     */
    suspend fun pipeline(urls: List<String>, selector: String): List<Elements> = coroutineScope {
        retrieveAllHtmlDocumentAsync(urls).awaitAll().map { parse(it, selector) }
    }

    /** Returns the elements of [document] matching [selector]. */
    private fun parse(document: Document, selector: String): Elements = document.select(selector)

    /**
     * Starts one download per URL in the receiving scope and returns the pending results.
     *
     * This is a [CoroutineScope] extension rather than a `suspend` function wrapping its own
     * `coroutineScope`: a nested scope would only return after every child had finished,
     * handing back already-completed [Deferred]s.
     */
    private fun CoroutineScope.retrieveAllHtmlDocumentAsync(urls: List<String>): List<Deferred<Document>> =
        urls.map { url ->
            // khttp.get is a blocking call, so run it on the IO dispatcher instead of
            // whatever dispatcher the caller happens to be using.
            async(Dispatchers.IO) {
                Jsoup.parse(khttp.get(url).text)
            }
        }

    /** Wraps [element] in a [JSONObject]. Currently not called from within this class. */
    private fun toJson(element: Element): JSONObject = JSONObject(element)
}
// Skrotlin.kt - main
import kotlin.system.measureTimeMillis
/**
 * Demo entry point: crawls a fixed set of sites through the slowwly endpoint, prints the
 * `div` elements found on each page and the number of pages processed, then prints the
 * total elapsed time in milliseconds.
 *
 * NOTE(review): the `delay/3000` path segment presumably makes each response wait about
 * 3 seconds, which is what makes the concurrency visible in the timing — confirm.
 */
suspend fun main(args: Array<String>) {
    val vanDelay = "http://slowwly.robertomurray.co.uk/delay/3000/url"

    val totalTime = measureTimeMillis {
        val targets = listOf(
            "https://http.cat",
            "https://httpstatusdogs.com/",
            "https://gympass.com",
            "https://google.com",
            "https://stackoverflow.com"
        )
        // Prefix every target with the delaying endpoint.
        val delayedUrls = targets.map { target -> "$vanDelay/$target" }

        val result = Crawler().pipeline(delayedUrls, "div")
        println(result)
        println(result.size)
    }

    println("total time: $totalTime")
}
@marquesds
Copy link
Author

Maven dependencies:

<!-- Extra repository; presumably required to resolve the com.github.jkcclemens:khttp artifact below (confirm). -->
<repositories>
    <repository>
        <id>jitpack.io</id>
        <url>https://jitpack.io</url>
    </repository>
</repositories>

<!-- ${kotlin.version} and ${junit.version} are not defined in this snippet; they must be declared as properties elsewhere in the POM. -->
<dependencies>
    <dependency>
        <groupId>org.jetbrains.kotlin</groupId>
        <artifactId>kotlin-stdlib</artifactId>
        <version>${kotlin.version}</version>
    </dependency>
    <!-- Test-only dependencies. -->
    <dependency>
        <groupId>org.jetbrains.kotlin</groupId>
        <artifactId>kotlin-test-junit</artifactId>
        <version>${kotlin.version}</version>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>${junit.version}</version>
        <scope>test</scope>
    </dependency>
    <!-- HTTP client used by Crawler (khttp.get). -->
    <dependency>
        <groupId>com.github.jkcclemens</groupId>
        <artifactId>khttp</artifactId>
        <version>0.1.0</version>
    </dependency>
    <!-- HTML parsing and CSS selection (Jsoup.parse, Document.select). -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.13.1</version>
    </dependency>
    <!-- Coroutine primitives used by Crawler (coroutineScope, async, awaitAll). -->
    <dependency>
        <groupId>org.jetbrains.kotlinx</groupId>
        <artifactId>kotlinx-coroutines-core</artifactId>
        <version>1.4.2</version>
    </dependency>
</dependencies>

@marquesds
Copy link
Author

Making parse async:

import kotlinx.coroutines.*
import org.json.JSONObject
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import org.jsoup.select.Elements

/**
 * Downloads a list of pages concurrently and extracts the elements that match a CSS selector.
 * In this variant each page is parsed and selected inside its own coroutine, as soon as
 * its download finishes.
 */
class Crawler {
    /**
     * Fetches every URL in [urls] concurrently and applies [selector] to each page.
     *
     * @param urls pages to download.
     * @param selector CSS selector passed to jsoup's `Document.select`.
     * @return one [Elements] per URL, in the same order as [urls].
     */
    suspend fun pipeline(urls: List<String>, selector: String): List<Elements> = coroutineScope {
        retrieveAllHtmlDocumentAsync(urls, selector).awaitAll()
    }

    /**
     * Starts one download-and-parse job per URL in the receiving scope and returns the
     * pending results.
     *
     * This is a [CoroutineScope] extension rather than a `suspend` function wrapping its own
     * `coroutineScope`: a nested scope would only return after every child had finished,
     * handing back already-completed [Deferred]s.
     */
    private fun CoroutineScope.retrieveAllHtmlDocumentAsync(
        urls: List<String>,
        selector: String
    ): List<Deferred<Elements>> =
        urls.map { url ->
            // khttp.get is a blocking call, so run it on the IO dispatcher instead of
            // whatever dispatcher the caller happens to be using.
            async(Dispatchers.IO) {
                parse(Jsoup.parse(khttp.get(url).text), selector)
            }
        }

    /** Returns the elements of [document] matching [selector]. */
    private fun parse(document: Document, selector: String): Elements = document.select(selector)

    /** Wraps [element] in a [JSONObject]. Currently not called from within this class. */
    private fun toJson(element: Element): JSONObject = JSONObject(element)

}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment