Created
January 18, 2019 10:33
-
-
Save mbszarek/0a9e560349c500167f297243052eb9bb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package webcrawler

import java.io.File
import java.net.URL
import java.nio.charset.Charset
import java.nio.file.{Files, Paths}
import java.util.{Comparator, Scanner}

import monix.eval.Task
import monix.execution.Scheduler.Implicits.global
import org.htmlcleaner.{HtmlCleaner, TagNode}

import scala.concurrent.Await
import scala.concurrent.duration.Duration
import scala.io.Source
import scala.util.Try
import scala.util.control.NonFatal
/** Command-line web crawler.
  *
  * Prompts for a start URL and a maximum depth, wipes and recreates the
  * `output` directory, then stores every reachable page (up to the requested
  * depth) as one file per URL inside `output`.
  */
object WebCrawler extends App {

  /** Shared HTML parser, passed implicitly to the crawling methods. */
  implicit val cleaner: HtmlCleaner = new HtmlCleaner

  /** Properties of the shared cleaner; not used by the crawler itself. */
  val props = cleaner.getProperties

  // Kept for source compatibility with existing users of this object. Link
  // resolution in `analyzeSubUrls` is delegated to `java.net.URL`, which
  // handles absolute URLs, absolute paths and relative paths uniformly.
  val pathRegex = """^(/.*)""".r
  val httpRegex = """^(http.*)""".r

  /** Downloads `url` and writes its text to a file under `output`.
    *
    * The file name is the host followed by the path, with every `.` and `/`
    * replaced by `_`. The page is decoded as UTF-8 first; if that attempt
    * fails for any reason it is fetched again and decoded as ISO-8859-1.
    * The file is always written as UTF-8.
    *
    * @param url page to download
    * @return a task that completes once the file has been written and closed
    */
  def saveContent(url: URL): Task[Unit] = {
    import scala.io.Codec

    // Reads the whole page with the given codec and always releases the
    // underlying connection, even when decoding fails half-way.
    def fetch(codec: Codec): String = {
      val source = Source.fromURL(url)(codec)
      try source.mkString
      finally source.close()
    }

    Task {
      val target = Paths.get("output", s"${url.getHost}${url.getPath}".replaceAll("\\.|/", "_"))
      val content = Try(fetch(Codec.UTF8)).getOrElse(fetch(Codec.ISO8859))
      (target, content)
    }.flatMap {
      case (target, content) =>
        // The writer is opened only after the download succeeded, so a failed
        // fetch can neither leak a file handle nor leave an empty file behind.
        Task(Files.newBufferedWriter(target, Charset.forName("UTF-8")))
          .bracket(writer => Task(writer.write(content)))(writer => Task(writer.close()))
    }
  }

  /** Crawls `url` and, recursively, the pages it links to.
    *
    * A negative `level` is a no-op. Otherwise the page is parsed, saved via
    * [[saveContent]], and every link found on it is crawled with `level - 1`.
    * The sub-crawls are composed with `Task.gatherUnordered` instead of
    * blocking on a future inside the task.
    *
    * A non-fatal failure while processing this page is reported on stderr and
    * the page is skipped, so one broken link does not abort its siblings.
    *
    * @param url     page to crawl
    * @param level   remaining depth; the page itself is processed when `level >= 0`
    * @param cleaner HTML parser used to extract links
    * @return a task that completes when this page and all its descendants are done
    */
  def analyzeUrl(url: URL, level: Int)(implicit cleaner: HtmlCleaner): Task[Unit] =
    if (level < 0) Task.unit
    else {
      val crawl = for {
        node <- Task(cleaner.clean(url))
        _ <- saveContent(url)
        _ <- Task.gatherUnordered(analyzeSubUrls(node, level, url))
      } yield ()

      crawl.onErrorRecover {
        case NonFatal(error) => System.err.println(s"Skipping $url: $error")
      }
    }

  /** Builds one crawl task per distinct link found in `node`.
    *
    * Anchors without an `href`, blank links and pure in-page fragments
    * (`#...`) are ignored. Every remaining link is resolved against `url`;
    * links that cannot be parsed or that do not use `http`/`https` are dropped.
    *
    * @param node    parsed document whose `a` elements are inspected
    * @param level   depth of the page the links were found on; children get `level - 1`
    * @param url     address of that page, used as the base for relative links
    * @param cleaner HTML parser forwarded to [[analyzeUrl]]
    * @return the (not yet started) crawl tasks, one per accepted link
    */
  def analyzeSubUrls(node: TagNode, level: Int, url: URL)(implicit cleaner: HtmlCleaner): Seq[Task[Unit]] = {
    import scala.collection.JavaConverters._

    val hrefs = node
      .getElementListByName("a", true)
      .asScala
      .toList
      // getAttributeByName yields null for anchors that carry no href.
      .flatMap(anchor => Option(anchor.getAttributeByName("href")))
      .map(_.trim)
      .filter(_.nonEmpty)
      .filterNot(_.startsWith("#"))
      .distinct

    hrefs
      // `new URL(base, spec)` throws on unknown schemes such as `javascript:`.
      .flatMap(href => Try(new URL(url, href)).toOption)
      .filter(target => target.getProtocol == "http" || target.getProtocol == "https")
      .map(target => analyzeUrl(target, level - 1))
  }

  /** Reads the start URL and the maximum depth from standard input.
    *
    * The scanner (and with it standard input) is closed afterwards, whether
    * or not reading succeeded.
    *
    * @return the trimmed URL text and the depth entered by the user
    */
  def getInput: Task[(String, Int)] =
    Task(new Scanner(System.in)).bracket { in =>
      Task {
        println("Enter URL:")
        val url = in.nextLine().trim
        println("Enter max depth:")
        val depth = in.nextInt()
        (url, depth)
      }
    } { in =>
      Task(in.close())
    }

  /** The whole program as a single task: reset `output`, ask for input, crawl. */
  def main: Task[Unit] = {
    val outputDir = Paths.get("output")

    // Best-effort removal of a previous run's output. Entries are visited in
    // reverse order so that files are deleted before their parent directory.
    // The stream returned by Files.walk holds open directory handles and has
    // to be closed explicitly.
    val clearOutput = Task {
      val entries = Files.walk(outputDir)
      try entries
        .sorted(Comparator.reverseOrder())
        .map[File](_.toFile)
        .forEach(file => file.delete())
      finally entries.close()
    }.onErrorRecover {
      case NonFatal(_) => () // typically: the directory does not exist yet
    }

    for {
      _ <- clearOutput
      _ <- Task(Files.createDirectories(outputDir))
      input <- getInput
      (urlString, depth) = input
      url = new URL(urlString)
      _ <- analyzeUrl(url, depth)
    } yield ()
  }

  // Program edge: the only place where blocking on the result is appropriate.
  // Failures are reported instead of being dropped by Await.ready.
  Await.ready(
    main.onErrorRecover {
      case NonFatal(error) => System.err.println(s"Crawl failed: $error")
    }.runToFuture,
    Duration.Inf
  )
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment