Skip to content

Instantly share code, notes, and snippets.

@iantbutler01
Last active February 5, 2020 06:49
Show Gist options
  • Save iantbutler01/981d4cf15a20f60385de64e4d6ac8f31 to your computer and use it in GitHub Desktop.
# Pops one `[url, depth]` entry off the shared work queue and crawls it.
#
# Entries deeper than `depth_limit` are skipped. A successfully fetched page
# is written to /tmp/toy_crawler_results/<sha256-of-body>.html (Base32-encoded
# hash, so repeated content dedupes to one file), and each child URL found in
# the body is re-queued one level deeper. Always returns :ok.
#
# NOTE(review): this processes a single queue entry per call — presumably the
# caller loops/supervises; confirm against the call site.
def work(depth_limit \\ 5) do
  case Queue.pop() do
    :empty ->
      :ok

    {:value, [url, depth]} ->
      if depth > depth_limit do
        :ok
      else
        case request_page(url) do
          # More robust error handling could branch on the HTTP status code:
          # typically an HTTP error means the URL is not crawlable and not
          # worth retrying, but at least logging the specific error is useful.
          {:error, _} ->
            :ok

          {:ok, body} ->
            File.mkdir_p!("/tmp/toy_crawler_results")
            file_hash = Base.encode32(:crypto.hash(:sha256, body))
            File.write!("/tmp/toy_crawler_results/#{file_hash}.html", body)

            # BUG FIX: the pop clause above destructures entries as
            # [url, depth], but the original pushed [depth + 1, c_url] —
            # swapped order — so re-queued entries would be mis-read on the
            # next pop. Push in the same [url, depth] shape. Enum.each (not
            # map) since this is purely for the push side effect.
            body
            |> get_children_urls()
            |> Enum.each(fn child_url ->
              Queue.push([child_url, depth + 1])
            end)

            :ok
        end
      end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment