Skip to content

Instantly share code, notes, and snippets.

@indraniel
Forked from dom96/crawler.nim
Created May 29, 2018 16:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save indraniel/3af0358d25934d973b2c9563d587b536 to your computer and use it in GitHub Desktop.
Save indraniel/3af0358d25934d973b2c9563d587b536 to your computer and use it in GitHub Desktop.
import
asyncdispatch, asyncnet, htmlparser, xmltree, httpclient, strutils,
strtabs, streams, uri, sets
# URLs that `crawl` has already started fetching; consulted so that no page
# is requested twice.
var visited = initHashSet[string]()
proc crawl(url: string,
           client: AsyncHttpClient = newAsyncHttpClient()) {.async.} =
  ## Recursively crawls `url`, depth first, following every `<a href>` found
  ## in an HTML response.
  ##
  ## * `url`: absolute URL of the page to fetch.
  ## * `client`: HTTP client used for the request; a new one is created when
  ##   the argument is omitted.
  ##
  ## Each URL is added to the module-level `visited` set before it is
  ## fetched, so a page is requested at most once. Failures of the HTTP
  ## request are not caught here; they surface through the returned future.
  if url in visited: return # Already visited this URL.
  echo("Crawling ", url)
  visited.incl(url)

  let resp = await client.get(url)
  # Servers usually append parameters ("text/html; charset=utf-8"), so match
  # on the media-type prefix instead of comparing the whole header value.
  if not resp.status.startsWith("200") or
      not resp.contentType.toLowerAscii.startsWith("text/html"):
    return

  # The async client hands the body out as a future that must be awaited.
  let html = parseHtml(newStringStream(await resp.body))
  for a in html.findAll("a"):
    # `attr` yields "" for anchors without an `href` (or without any
    # attributes at all) instead of raising.
    let href = a.attr("href")
    if href == "": continue
    if href.startsWith("http://") or href.startsWith("https://"):
      # Absolute link: crawl it on its own client and release that client
      # afterwards. Awaiting keeps failures from being silently dropped.
      let linkClient = newAsyncHttpClient()
      try:
        await crawl(href, linkClient)
      finally:
        linkClient.close()
    else:
      # Relative link: resolve it against the page it was found on.
      let fullUrl = combine(parseUri(url), parseUri(href))
      # We reuse this client because the connection is kept alive.
      await crawl($fullUrl, client)
# Drive the event loop until the whole crawl has finished; `waitFor` also
# re-raises any error the crawl ended with instead of leaving the future
# unobserved.
waitFor crawl("http://nimrod-lang.org")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment