Created
April 6, 2014 18:18
-
-
Save dom96/10009691 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import
  asyncdispatch, asyncnet, htmlparser, xmltree, httpclient, strutils,
  strtabs, streams, uri, sets

# URLs that have already been scheduled for crawling; `crawl` consults and
# updates this set so that every page is fetched at most once.
var visited = initSet[string]()
proc crawl(url: string,
           client: PAsyncHttpClient = newAsyncHttpClient()) {.async.} =
  ## Recursively crawls `url`, following every `<a href>` found in the page.
  ##
  ## `url` is added to the global `visited` set before it is fetched, so a
  ## URL that is already in the set is skipped. Only responses whose status
  ## starts with "200" and whose Content-Type starts with "text/html" are
  ## parsed for links.
  ##
  ## Links starting with "http://" are crawled with a fresh default client;
  ## every other non-empty href is resolved against `url` and crawled with
  ## `client`.
  if url in visited: return # Already visited this URL.
  echo("Crawling ", url)
  visited.incl(url)

  let resp = await client.get(url)
  # The header may carry parameters ("text/html; charset=utf-8"), so match
  # on the media-type prefix instead of requiring exact equality.
  if resp.status.startsWith("200") and
      resp.headers["Content-Type"].startsWith("text/html"):
    let html = parseHtml(newStringStream(resp.body))
    for a in html.findAll("a"):
      # `attr` yields "" for anchors without an href (or without any
      # attributes at all) instead of failing on the table lookup.
      let href = a.attr("href")
      if href != "":
        if href.startsWith("http://"):
          # Absolute link: await it so the returned future (and any error
          # it carries) is not silently dropped.
          await crawl(href)
        else:
          # NOTE(review): "https://", "mailto:" and similar hrefs also end
          # up here and are joined onto `url` -- confirm whether they
          # should be skipped or handled as absolute links.
          let fullUrl = TUrl(url) / TUrl(href)
          # We reuse this client because the connection is kept alive.
          await crawl($fullUrl, client)
# Kick off the crawl from the seed URL. The returned future is discarded
# explicitly; the dispatcher loop below is what actually drives the crawl.
discard crawl("http://nimrod-lang.org")
runForever()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment