-
-
Save indraniel/3af0358d25934d973b2c9563d587b536 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import | |
asyncdispatch, asyncnet, htmlparser, xmltree, httpclient, strutils, | |
strtabs, streams, uri, sets | |
# URLs that some `crawl` call has already claimed. Shared by every crawl so a
# page is fetched at most once, even when several crawls run concurrently.
var visited = initHashSet[string]()

proc crawl(url: string,
           client: AsyncHttpClient = newAsyncHttpClient()) {.async.} =
  ## Fetches `url`, and if the response is a successful HTML page, follows
  ## every `<a href>` found in it.
  ##
  ## * `url` - absolute URL of the page to fetch.
  ## * `client` - HTTP client used for the request. Links without a scheme are
  ##   followed one after another on this same client; absolute http(s) links
  ##   are started as independent crawls, each on a fresh default client.
  ##
  ## Fetch errors are reported with `echo` and end this crawl only; they are
  ## not propagated to the caller.
  ##
  ## NOTE(review): clients created through the default argument are never
  ## closed here - confirm whether that matters for long crawls.
  if url in visited:
    return # Already claimed by another crawl.
  echo("Crawling ", url)
  # Claim the URL before the first `await` so that a concurrent crawl that
  # discovers the same link does not fetch it again.
  visited.incl(url)

  var body = ""
  var isHtmlPage = false
  try:
    let resp = await client.get(url)
    # Content-Type usually carries parameters ("text/html; charset=utf-8"),
    # so compare the media-type prefix rather than the whole header value.
    isHtmlPage = resp.status.startsWith("200") and
        resp.contentType.toLowerAscii.startsWith("text/html")
    if isHtmlPage:
      body = await resp.body
  except CatchableError as e:
    # One unreachable or broken page must not abort the rest of the crawl.
    echo("Failed to crawl ", url, ": ", e.msg)
  if not isHtmlPage:
    return

  let base = parseUri(url)
  let html = parseHtml(newStringStream(body))
  for a in html.findAll("a"):
    # `attr` yields "" when the anchor has no href (or no attributes at all).
    let href = a.attr("href")
    if href == "":
      continue
    let link = parseUri(href)
    let scheme = link.scheme.toLowerAscii
    if scheme in ["http", "https"]:
      # Absolute link: crawl it concurrently. `asyncCheck` keeps a failure
      # of that crawl from being silently dropped.
      asyncCheck crawl(href)
    elif scheme == "":
      # Relative link: resolve it against the current page.
      let fullUrl = combine(base, link)
      # We reuse this client because the connection is kept alive.
      await crawl($fullUrl, client)
    # Links with any other scheme (mailto:, javascript:, ...) are skipped.
# Start the crawl at the project home page. `crawl` returns a Future; passing
# it to `asyncCheck` makes a failure of the root crawl surface as an exception
# instead of being discarded.
asyncCheck crawl("http://nimrod-lang.org")
# Hand control to the async dispatcher, which drives all pending crawls.
runForever()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment