@mbszarek
Created January 18, 2019 10:33
package webcrawler

import java.io.File
import java.net.URL
import java.nio.charset.Charset
import java.nio.file.{Files, Paths}
import java.util.{Comparator, Scanner}

import monix.eval.Task
import monix.execution.Scheduler.Implicits.global
import org.htmlcleaner.{HtmlCleaner, TagNode}

import scala.concurrent.Await
import scala.concurrent.duration.Duration
import scala.io.Source
import scala.util.Try

object WebCrawler extends App {
  implicit val cleaner: HtmlCleaner = new HtmlCleaner
  val props = cleaner.getProperties

  // Links starting with "/" are resolved against the current host; absolute http(s) links are used as-is.
  val pathRegex = """^(/.*)""".r
  val httpRegex = """^(http.*)""".r
  /** Downloads the page at `url` and writes its contents to a flat file under ./output. */
  def saveContent(url: URL): Task[Unit] =
    Task {
      import scala.io.Codec
      // Derive a file name from the host and path, e.g. example.com/a/b -> example_com_a_b
      val path = Paths.get("output", s"${url.getHost}${url.getPath}".replaceAll("\\.|/", "_"))
      (
        Files.newBufferedWriter(path, Charset.forName("UTF-8")),
        // Read as UTF-8 first, falling back to ISO-8859-1 for pages with a different encoding
        Try {
          Source.fromURL(url)(Codec.UTF8).mkString
        }.getOrElse(Source.fromURL(url)(Codec.ISO8859).mkString)
      )
    }.bracket {
      case (writer, content) =>
        Task {
          writer.write(content)
        }
    } {
      // Release: always close the writer, whether or not the write succeeded
      case (writer, _) =>
        Task {
          writer.close()
        }
    }
  /** Saves the page at `url`, then recursively crawls the links found on it until `level` drops below zero. */
  def analyzeUrl(url: URL, level: Int)(implicit cleaner: HtmlCleaner): Task[Unit] =
    Task {
      if (level >= 0) {
        val node = cleaner.clean(url)
        // Save this page, then process all of its links in parallel
        val operations = for {
          _ <- saveContent(url)
          _ <- Task.gatherUnordered {
            analyzeSubUrls(node, level, url)
          }
        } yield ()
        Await.ready(operations.runToFuture, Duration.Inf)
      }
    }
  /** Builds one crawl task per <a href="..."> link found in the cleaned page. */
  def analyzeSubUrls(node: TagNode, level: Int, url: URL)(implicit cleaner: HtmlCleaner): Seq[Task[Unit]] = {
    import scala.collection.JavaConverters._
    for {
      element <- node.getElementListByName("a", true).asScala
      link = element.getAttributeByName("href")
      if link != null // skip anchors without an href attribute
    } yield {
      val target = link match {
        case pathRegex(value) => // absolute path, resolve against the current host
          new URL(url, value)
        case httpRegex(_) => // fully qualified http(s) link
          new URL(link)
        case value => // treat anything else as relative to the current URL
          new URL(s"$url/$value")
      }
      analyzeUrl(target, level - 1)
    }
  }
  /** Reads the start URL and the maximum crawl depth from standard input. */
  def getInput: Task[(String, Int)] =
    Task {
      new Scanner(System.in)
    }.bracket { in =>
      Task {
        println("Enter URL:")
        val url = in.nextLine()
        println("Enter max depth:")
        val depth = in.nextInt()
        (url, depth)
      }
    } { in =>
      Task {
        in.close()
      }
    }
  def main: Task[Unit] = for {
    // Wipe any previous output directory (deepest entries first); ignore the error if it does not exist yet
    _ <- Task {
      Files.walk(Paths.get("output"))
        .sorted(Comparator.reverseOrder())
        .map[File](_.toFile)
        .forEach(file => file.delete())
    }.onErrorRecover {
      case _ => ()
    }
    _ <- Task {
      Files.createDirectories(Paths.get("output"))
    }
    tuple <- getInput
    (urlString, depth) = tuple
    url = new URL(urlString)
    _ <- analyzeUrl(url, depth)
  } yield ()

  Await.ready(main.runToFuture, Duration.Inf)
}
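
The gist ships without a build definition. Below is a minimal build.sbt sketch that should compile it; the library coordinates are the standard Maven Central ones for Monix and HtmlCleaner, but the version numbers are assumptions on my part, not something the gist specifies.

// Hypothetical build.sbt -- versions are illustrative, adjust to your setup
name := "webcrawler"

scalaVersion := "2.12.8"

libraryDependencies ++= Seq(
  "io.monix"                    %% "monix"       % "3.0.0", // Task, Scheduler, bracket, gatherUnordered
  "net.sourceforge.htmlcleaner" %  "htmlcleaner" % "2.22"   // HtmlCleaner, TagNode
)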