Skip to content

Instantly share code, notes, and snippets.

@Manzanit0
Last active July 25, 2022 08:58
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save Manzanit0/ccc8419319cca1fa7e7486bfda5e3885 to your computer and use it in GitHub Desktop.
Save Manzanit0/ccc8419319cca1fa7e7486bfda5e3885 to your computer and use it in GitHub Desktop.
Web crawler which uses Floki and HTTPoison – does 5 requests at a time
# Dependencies:
# {:httpoison, "~> 1.5"},
# {:floki, "~> 0.21.0"}
# {:benchee, "~> 1.0"} (Only for benchmarking – not in the script)
defmodule CrawlQueue do
  @moduledoc """
  FIFO queue of URLs shared across crawl iterations, backed by an Agent
  holding an Erlang `:queue`.
  """
  use Agent

  @doc """
  Starts the queue Agent, registered under `#{inspect(__MODULE__)}`,
  seeded with the given list of `urls`.
  """
  def start_link(urls) do
    Agent.start_link(fn -> :queue.from_list(urls) end, name: __MODULE__)
  end

  @doc """
  Removes and returns the oldest URL, or `:empty` when the queue is empty.

  Uses `Agent.get_and_update/2` so the read and the write happen in one
  atomic step. The original get-then-update pair was racy: two concurrent
  callers could read the same queue state, returning the same URL and
  losing the other caller's pop.
  """
  def pop do
    Agent.get_and_update(__MODULE__, fn queue ->
      case :queue.out(queue) do
        {{:value, value}, rest} -> {value, rest}
        {:empty, rest} -> {:empty, rest}
      end
    end)
  end

  @doc "Appends `url` to the back of the queue."
  def push(url) do
    Agent.update(__MODULE__, &:queue.in(url, &1))
  end
end
defmodule Crawler do
  @moduledoc """
  Breadth-first web crawler: fetches pages with HTTPoison, extracts anchor
  `href`s with Floki, and scans up to `pool_size` URLs concurrently per
  iteration until roughly 500 URLs have been collected.
  """
  alias CrawlQueue, as: Queue

  # pool_size     - max number of URLs fetched concurrently per iteration
  # results       - every URL collected so far (also serves as the "seen" set)
  # current       - URLs being scanned in the present iteration
  # pending_sieve - raw hrefs scraped this iteration, not yet filtered
  # sieved        - hrefs that survived filtering, waiting to be queued
  defstruct [:pool_size, :results, :current, :pending_sieve, :sieved]

  @doc """
  Entry point: crawls starting from `seed`, scanning up to `pool_size`
  URLs at a time, and returns the list of collected URLs.
  """
  def init(seed, pool_size) do
    Queue.start_link([])
    crawl(%__MODULE__{pool_size: pool_size, current: [seed], results: []})
  end

  # Terminates when nothing is left to scan...
  defp crawl(%{current: [], results: results}), do: results

  # ...or once enough results have been gathered.
  defp crawl(struct) do
    if length(struct.results) >= 500 do
      struct.results
    else
      struct
      |> scan_async()     # scans current urls, storing scraped hrefs in pending_sieve
      |> sieve()          # drops already-seen and duplicate urls
      |> push_to_queue()  # queues the survivors and records them as results
      |> take()           # refills current with up to pool_size queued urls
      |> crawl()          # restart crawl process
    end
  end

  # Filters pending_sieve down to URLs never seen before.
  defp sieve(%{pending_sieve: pending, results: results} = struct) do
    %{struct | sieved: filter_already_scanned_urls(pending, results), pending_sieve: []}
  end

  # Rejects URLs already present in `scanned`. `Enum.uniq/1` also drops
  # duplicates *within* the batch, which the original missed: the same href
  # scraped twice in one iteration was queued and counted as two results.
  # A MapSet turns the per-URL membership test from O(n) into O(log n).
  defp filter_already_scanned_urls(urls, scanned) do
    seen = MapSet.new(scanned)

    urls
    |> Enum.uniq()
    |> Enum.reject(&MapSet.member?(seen, &1))
  end

  # Enqueues the sieved URLs and records them as results immediately, so
  # later iterations treat them as already seen.
  defp push_to_queue(%{sieved: pending, results: results} = struct) do
    Enum.each(pending, &Queue.push/1)
    %{struct | sieved: [], results: results ++ pending}
  end

  # Pops queued URLs into `current` until the pool is full or the queue
  # is empty (`Queue.pop/0` returns :empty, which is not a binary).
  defp take(%{pool_size: pool_size, current: urls} = struct) do
    if length(urls) < pool_size do
      case Queue.pop() do
        url when is_binary(url) -> take(%{struct | current: [url | urls]})
        _empty -> struct
      end
    else
      struct
    end
  end

  # Fetches every URL in `current` concurrently and collects all scraped
  # hrefs into pending_sieve. (The original also had a dead `scan_async([])`
  # clause — crawl/1 only ever passes the struct, and an empty `current`
  # is caught by the first crawl/1 clause, so it is removed here.)
  defp scan_async(%{current: urls} = struct) when is_list(urls) do
    urls
    |> Enum.map(fn url -> Task.async(fn -> scan(url) end) end)
    # The await must outlast the HTTP timeouts below: connect + receive can
    # take up to ~30s combined, so the original 15s await could exit the
    # caller while a request was still legitimately in flight.
    |> Enum.map(fn t -> Task.await(t, 35_000) end)
    |> List.flatten()
    |> mark_as_pending_sieve(struct)
    |> clear_current()
  end

  # Downloads `url` and returns every href on the page; [] on failure.
  defp scan(url) do
    try do
      url
      |> HTTPoison.get!([], timeout: 15_000, recv_timeout: 15_000)
      |> Map.get(:body)
      |> Floki.find("* a")
      |> Floki.attribute("href")
    rescue
      # Any error when getting/parsing means no results are retrieved from
      # that website — there are a lot of awkward websites out there.
      CaseClauseError -> []
      HTTPoison.Error -> []
      ArgumentError -> []
    end
  end

  defp mark_as_pending_sieve(urls, struct), do: %__MODULE__{struct | pending_sieve: urls}
  defp clear_current(struct), do: %__MODULE__{struct | current: []}
end
@Manzanit0
Copy link
Author

Benchmarks:

Benchee.run(%{
    "5 workers" => fn -> Crawler.init("https://manzanit0.github.io", 5) end,
    "10 workers" => fn -> Crawler.init("https://manzanit0.github.io", 10) end,
    "20 workers" => fn -> Crawler.init("https://manzanit0.github.io", 20) end,
    "50 workers" => fn -> Crawler.init("https://manzanit0.github.io", 50) end,
    "100 workers" => fn -> Crawler.init("https://manzanit0.github.io", 100) end
  },
  time: 10,
  memory_time: 2
)

@Manzanit0
Copy link
Author

Benchmark results 1:

Name                  ips        average  deviation         median         99th %
20 workers           0.36         2.78 s    ±11.03%         2.80 s         3.14 s
5 workers            0.31         3.24 s    ±35.04%         3.13 s         4.73 s
10 workers           0.23         4.27 s    ±94.25%         2.98 s         9.88 s
50 workers           0.22         4.48 s    ±74.24%         5.26 s         7.34 s
100 workers        0.0658        15.21 s     ±0.00%        15.21 s        15.21 s

Comparison:
20 workers           0.36
5 workers            0.31 - 1.17x slower +0.46 s
10 workers           0.23 - 1.53x slower +1.49 s
50 workers           0.22 - 1.61x slower +1.70 s
100 workers        0.0658 - 5.47x slower +12.42 s

Memory usage statistics:

Name                average  deviation         median         99th %
20 workers         43.83 MB    ±37.92%       43.83 MB       55.58 MB
5 workers         104.92 MB     ±0.00%      104.92 MB      104.92 MB
10 workers         18.20 MB     ±0.00%       18.20 MB       18.20 MB
50 workers        205.63 MB     ±8.21%      205.63 MB      217.57 MB
100 workers       112.55 MB     ±0.00%      112.55 MB      112.55 MB

Comparison:
20 workers         43.83 MB
5 workers         104.92 MB - 2.39x memory usage +61.09 MB
10 workers         18.20 MB - 0.42x memory usage -25.62134 MB
50 workers        205.63 MB - 4.69x memory usage +161.80 MB
100 workers       112.55 MB - 2.57x memory usage +68.72 MB

@Manzanit0
Copy link
Author

Benchmark results 2:

Name                  ips        average  deviation         median         99th %
20 workers           0.36         2.78 s    ±11.03%         2.80 s         3.14 s
5 workers            0.31         3.24 s    ±35.04%         3.13 s         4.73 s
10 workers           0.23         4.27 s    ±94.25%         2.98 s         9.88 s
50 workers           0.22         4.48 s    ±74.24%         5.26 s         7.34 s
100 workers        0.0658        15.21 s     ±0.00%        15.21 s        15.21 s

Comparison:
20 workers           0.36
5 workers            0.31 - 1.17x slower +0.46 s
10 workers           0.23 - 1.53x slower +1.49 s
50 workers           0.22 - 1.61x slower +1.70 s
100 workers        0.0658 - 5.47x slower +12.42 s

Memory usage statistics:

Name                average  deviation         median         99th %
20 workers         43.83 MB    ±37.92%       43.83 MB       55.58 MB
5 workers         104.92 MB     ±0.00%      104.92 MB      104.92 MB
10 workers         18.20 MB     ±0.00%       18.20 MB       18.20 MB
50 workers        205.63 MB     ±8.21%      205.63 MB      217.57 MB
100 workers       112.55 MB     ±0.00%      112.55 MB      112.55 MB

Comparison:
20 workers         43.83 MB
5 workers         104.92 MB - 2.39x memory usage +61.09 MB
10 workers         18.20 MB - 0.42x memory usage -25.62134 MB
50 workers        205.63 MB - 4.69x memory usage +161.80 MB
100 workers       112.55 MB - 2.57x memory usage +68.72 MB

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment