Skip to content

Instantly share code, notes, and snippets.

@dom96
Created April 6, 2014 18:18
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save dom96/10009691 to your computer and use it in GitHub Desktop.
Save dom96/10009691 to your computer and use it in GitHub Desktop.
import
asyncdispatch, asyncnet, htmlparser, xmltree, httpclient, strutils,
strtabs, streams, uri, sets
# URLs that have already been handed to `crawl`. The set is module-level so
# that every `crawl` invocation shares it; `crawl` checks it on entry and adds
# the URL before sending the request.
var visited = initSet[string]()
proc crawl(url: string, client: PAsyncHttpClient = newAsyncHttpClient()) {.async.} =
  ## Fetches `url` and recursively crawls the targets of its `<a>` elements.
  ##
  ## `url` is added to the global `visited` set before the request is sent, so
  ## a URL that was already seen is skipped. `client` is the HTTP client used
  ## for the request; when omitted, a new client is created for this call.
  ##
  ## Only responses whose status starts with "200" and whose Content-Type
  ## starts with "text/html" are parsed for links.
  if url in visited: return # Already visited this URL.
  echo("Crawling ", url)
  visited.incl(url)

  let resp = await client.get(url)
  # Servers commonly send parameters after the media type
  # ("text/html; charset=utf-8"), so match on the prefix rather than
  # requiring the header to equal "text/html" exactly.
  if resp.status.startsWith("200") and
      resp.headers["Content-Type"].startsWith("text/html"):
    let html = parseHtml(newStringStream(resp.body))
    for a in html.findAll("a"):
      # `attr` returns "" when the attribute is absent, including for
      # elements that carry no attributes at all, so anchors such as a bare
      # `<a>` are skipped instead of failing on a missing attribute table.
      let href = a.attr("href")
      if href == "" or href.startsWith("#") or href.startsWith("mailto:") or
          href.startsWith("javascript:"):
        # Nothing to fetch: no target, a fragment of the current page, or a
        # scheme that does not name an HTTP resource.
        discard
      elif href.startsWith("http://") or href.startsWith("https://"):
        # Absolute link: crawl it with its own client. The returned future is
        # deliberately not awaited, so this crawl proceeds independently of
        # the current page and its failures are not observed here.
        # NOTE(review): confirm the HTTP client in use supports https.
        discard crawl(href)
      else:
        # Relative link: resolve it against the current URL.
        let fullUrl = TUrl(url) / TUrl(href)
        # We reuse this client because the connection is kept alive.
        await crawl($fullUrl, client)
# Entry point: kick off the crawl at the seed URL, then hand control to the
# async dispatcher. The returned future is intentionally dropped; the
# dispatcher loop below is what drives this crawl and any others it starts.
const startUrl = "http://nimrod-lang.org"
discard crawl(startUrl)
runForever()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment